diff --git a/CMakeLists.txt b/CMakeLists.txt index 69aa0e6..8882479 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -28,9 +28,8 @@ set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib/${CMAKE_SYSTEM_PROCES set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/exe/${CMAKE_SYSTEM_PROCESSOR}/${SYMCRYPT_TARGET_ENV}) if(WIN32 AND SYMCRYPT_TARGET_ENV MATCHES "WindowsUserMode") - # Set DBG=1 and enable ASM_MASM. Annoyingly, this has to be done in the main CMake file rather than in - # the toolchain file - add_compile_options(-DDBG=1) + # Enable ASM_MASM. Annoyingly, this has to be done in the main CMake file rather than in the + # toolchain file enable_language(ASM_MASM) add_compile_options(/MP) # Remove /RTC1, incompatible with /Ox @@ -43,16 +42,23 @@ if(WIN32 AND SYMCRYPT_TARGET_ENV MATCHES "WindowsUserMode") string( REPLACE "/Od" "" CMAKE_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG}) string( REPLACE "/Od" "" CMAKE_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE}) string( REPLACE "/Od" "" CMAKE_C_FLAGS_RELEASE ${CMAKE_C_FLAGS_RELEASE}) - - IF(CMAKE_BUILD_TYPE MATCHES Release) - message("Release mode") + + if(CMAKE_BUILD_TYPE MATCHES Release) add_compile_options(/Oxs) - ENDIF(CMAKE_BUILD_TYPE MATCHES Release) + endif() elseif(NOT WIN32) enable_language(ASM) add_compile_options(-Wno-deprecated-declarations -Wno-deprecated) add_compile_options(-g) add_compile_options(-Wno-multichar) + add_compile_options(-fPIC) +endif() + +if(CMAKE_BUILD_TYPE MATCHES Release) + message("Release mode") +else() + message("Debug mode") + add_compile_options(-DDBG=1) endif() include_directories(inc) diff --git a/README.md b/README.md index 65fe375..b14c0f6 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Introduction +# Introduction SymCrypt is the core cryptographic function library currently used by Windows. ## History @@ -30,20 +30,23 @@ or gcc 7.4.0 on Linux. Note that CMake ships with Visual Studio 2019. 4. Configure CMake compilation: * For 32-bit Windows targets: `cmake .. -DCMAKE_TOOLCHAIN_FILE=../cmake-toolchain/windows-x86.cmake -A Win32` * For 64-bit Windows targets: `cmake .. -DCMAKE_TOOLCHAIN_FILE=../cmake-toolchain/windows-amd64.cmake` - * For Linux (or Windows with no CPU optimizations): `cmake ..` + * For 64-bit Linux targets: `cmake .. -DCMAKE_TOOLCHAIN_FILE=../cmake-toolchain/linux-amd64.cmake` + * For no CPU optimizations: `cmake ..` + * Optionally, for a release build, specify `-DCMAKE_BUILD_TYPE=Release` 5. `cmake --build .` + * Optionally, specify `-jN` where N is the number of processes you wish to spawn for the build If compilation succeeds, the output will be put in the `exe` subdirectory relative to where compilation occurred (i.e. `bin/exe` if you followed the instructions above). The SymCrypt unit test is in the `unittest` directory. It runs extensive functional tests on the SymCrypt library. On Windows it also compares results against other implementations such as the Windows APIs CNG -and CAPI, and the older crypto libraries rsa32 and msbignum, if they are available. It also provides +and CAPI, and the older crypto libraries rsa32 and msbignum, if they are available. It also provides detailed performance information. # Security Bugs If you believe you have found a problem that affects the security of this code, please do **NOT** create an issue -or pull request, but instead email your comments to secure@microsoft.com. +or pull request, but instead email your comments to secure@microsoft.com. # Contribute We love to receive comments and suggestions.
Unfortunately we cannot accept external code contributions at this time. diff --git a/cmake-toolchain/linux-amd64.cmake b/cmake-toolchain/linux-amd64.cmake index 4af573d..6e4e3a1 100644 --- a/cmake-toolchain/linux-amd64.cmake +++ b/cmake-toolchain/linux-amd64.cmake @@ -10,7 +10,6 @@ set(SYMCRYPT_TARGET_ENV Linux) # Define _AMD64_ to set up the correct SymCrypt macros, e.g. SYMCRYPT_CPU_AMD64 add_compile_options(-D_AMD64_) -add_compile_options(-DDBG) add_compile_options(-O3) # Enable a baseline of features for the compiler to support everywhere diff --git a/inc/C_asm_shared.inc b/inc/C_asm_shared.inc index 0057c3e..7a7e326 100644 --- a/inc/C_asm_shared.inc +++ b/inc/C_asm_shared.inc @@ -1,70 +1,43 @@ -;/* -; C_asm_shared.inc file to synchronize C and Asm information -; Copyright (c) Microsoft Corporation. Licensed under the MIT license. +/* + C_asm_shared.inc file to synchronize C and Asm information + Copyright (c) Microsoft Corporation. Licensed under the MIT license. -; This is a file that compiles both in C and ASM to define values in a way that is guaranteed to be the same on both sides. -; We use this to define the structure offsets that the ASM code uses. -; By having equivalent C constants we can add checks to the C code to ensure they are correct. -; -; This is an ugly hack, but it works :-) -; -; Due to the fact that the ARM assemblers use the C precompiler -; the C files have to redefine EQU to nothing before including this file. -; */ + This is a file that is included in both C and ASM such that the values are the same on both sides. + We use the C preprocessor to set ASM constants, as we already need to use the C preprocessor for + symcryptasm processing (see scripts/symcryptasm_processor.py). + We use this to define the structure offsets that the ASM code uses. + By having equivalent C constants we can add checks to the C code to ensure they are correct. 
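As an illustration of the pattern described above, here is a minimal, self-contained C sketch (the EXAMPLE_* and Example* names are hypothetical stand-ins, not the real SymCrypt definitions) showing how a single SET(...) line can serve MASM, GAS and C, and how the C side can verify a hard-coded offset:

```c
/* Minimal sketch of the shared-constant pattern described above; the
   EXAMPLE_* and Example* names are hypothetical stand-ins, not the real
   SymCrypt definitions. */
#include <stddef.h>
#include <stdio.h>

#if defined(EXAMPLE_MASM)
#define SET(_variable, _value) _variable EQU _value
#elif defined(EXAMPLE_GAS)
#define SET(_variable, _value) .set _variable, _value
#else /* plain C */
#define SET(_variable, _value) static const size_t _variable = _value;
#endif

typedef struct {
    unsigned type;
    unsigned nDigits;   /* expected at offset 4, matching the constant below */
} EXAMPLE_MODULUS;

SET(ExampleModulusNdigitsOffset, 4)  /* one line, usable from MASM, GAS and C */

int main(void)
{
    /* The C build can verify the offset that the ASM code hard-codes. */
    if (ExampleModulusNdigitsOffset != offsetof(EXAMPLE_MODULUS, nDigits)) {
        fprintf(stderr, "ASM offset out of sync with C struct layout\n");
        return 1;
    }
    return 0;
}
```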
+*/ -;const SIZE_T -SymCryptModulusNdigitsOffsetAmd64 EQU 4; +#if defined(SYMCRYPT_MASM) +#define SET(_variable, _value) _variable EQU _value +#elif defined(SYMCRYPT_GAS) +#define SET(_variable, _value) .set _variable, _value +#else // assume C +#define SET(_variable, _value) const SIZE_T _variable = _value; +#endif -; const SIZE_T -SymCryptModulusMontgomeryInv64OffsetAmd64 EQU 32; +SET(SymCryptModulusNdigitsOffsetAmd64, 4); +SET(SymCryptModulusMontgomeryInv64OffsetAmd64, 32); +SET(SymCryptModulusValueOffsetAmd64, 128); -; const SIZE_T -SymCryptModulusValueOffsetAmd64 EQU 128; +SET(SymCryptModulusNdigitsOffsetX86, 4); +SET(SymCryptModulusMontgomeryInv64OffsetX86, 24); +SET(SymCryptModulusValueOffsetX86, 96); +SET(SymCryptModulusNdigitsOffsetArm64, 4); +SET(SymCryptModulusMontgomeryInv64OffsetArm64, 32); +SET(SymCryptModulusValueOffsetArm64, 128); +SET(SymCryptModulusNdigitsOffsetArm, 4); +SET(SymCryptModulusMontgomeryInv64OffsetArm, 24); +SET(SymCryptModulusValueOffsetArm, 96); - -;const SIZE_T -SymCryptModulusNdigitsOffsetX86 EQU 4; - -; const SIZE_T -SymCryptModulusMontgomeryInv64OffsetX86 EQU 24; - -; const SIZE_T -SymCryptModulusValueOffsetX86 EQU 96; - - - - -;const SIZE_T -SymCryptModulusNdigitsOffsetArm64 EQU 4; - -; const SIZE_T -SymCryptModulusMontgomeryInv64OffsetArm64 EQU 32; - -; const SIZE_T -SymCryptModulusValueOffsetArm64 EQU 128; - - - - -;const SIZE_T -SymCryptModulusNdigitsOffsetArm EQU 4; - -; const SIZE_T -SymCryptModulusMontgomeryInv64OffsetArm EQU 24; - -; const SIZE_T -SymCryptModulusValueOffsetArm EQU 96; - - - - -; /* - IF 0 -; */ -#undef EQU +#if !defined(SYMCRYPT_MASM) && !defined(SYMCRYPT_GAS) +// Preserve the definition of SET for use in symcryptasm processing +#undef SET +#endif #if SYMCRYPT_CPU_AMD64 #define SYMCRYPT_CHECK_ASM_OFFSETS \ @@ -89,14 +62,9 @@ SymCryptModulusValueOffsetArm EQU 96; SYMCRYPT_CHECK_ASM_OFFSET( SymCryptModulusNdigitsOffsetArm, SYMCRYPT_FIELD_OFFSET( SYMCRYPT_MODULUS, nDigits ) );\ SYMCRYPT_CHECK_ASM_OFFSET( SymCryptModulusMontgomeryInv64OffsetArm, SYMCRYPT_FIELD_OFFSET( SYMCRYPT_MODULUS, tm.montgomery.inv64 ));\ SYMCRYPT_CHECK_ASM_OFFSET( SymCryptModulusValueOffsetArm, SYMCRYPT_FIELD_OFFSET( SYMCRYPT_MODULUS, Divisor.Int.ti.fdef.uint32 ));\ - + #endif // CPU_* #if !defined( SYMCRYPT_CHECK_ASM_OFFSETS) #define SYMCRYPT_CHECK_ASM_OFFSETS #endif - - -; /* - ENDIF -; */ \ No newline at end of file diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index 8617135..edd3586 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -96,42 +96,148 @@ set(SOURCES_COMMON IEEE802_11SaeCustom.c ) +function(process_cppasm filepath outformat archdefine) + get_filename_component(fileextension ${filepath} EXT) + if(NOT fileextension STREQUAL .cppasm) + message(FATAL_ERROR "cppasm processing invoked on file with incorrect extension (${filepath} -> ${fileextension})") + endif() + if((NOT outformat STREQUAL gas) AND (NOT outformat STREQUAL masm)) + message(FATAL_ERROR "cppasm processing invoked with unrecognized outformat (${outformat})") + endif() + if((NOT archdefine STREQUAL amd64) AND (NOT archdefine STREQUAL x86)) + message(FATAL_ERROR "cppasm processing invoked with unrecognized archdefine (${archdefine})") + endif() + get_filename_component(rootpath ${filepath} DIRECTORY) + get_filename_component(filestem ${filepath} NAME_WE) # filestem is the filename w/out extension + string(TOUPPER ${outformat} outformatupper) + string(TOUPPER ${archdefine} archdefineupper) + string(FIND ${rootpath} ${CMAKE_CURRENT_BINARY_DIR} findindex) # check whether 
input is in the output directory + if(findindex EQUAL -1) # input in the source directory + set(filepath ${CMAKE_CURRENT_SOURCE_DIR}/${filepath}) + set(output_pass2 ${CMAKE_CURRENT_BINARY_DIR}/${rootpath}/${filestem}-${outformat}.asm) + else() # input in the output directory + set(output_directory ${rootpath}) + set(output_pass2 ${rootpath}/${filestem}.asm) + endif() + + set(dbg_definition "") + if(CMAKE_BUILD_TYPE MATCHES Debug) + set(dbg_definition "-DDBG=1") + endif() + + if(outformat STREQUAL gas) + # assume gas => GCC compatible C compiler + add_custom_command( + OUTPUT ${output_pass2} + COMMAND "${CMAKE_C_COMPILER}" -E -P -x c ${filepath} -o ${output_pass2} + -I${CMAKE_CURRENT_SOURCE_DIR} -I${CMAKE_CURRENT_SOURCE_DIR}/${rootpath} -I${CMAKE_SOURCE_DIR}/inc + -DSYMCRYPT_${outformatupper} -DSYMCRYPT_CPU_${archdefineupper} ${dbg_definition} + MAIN_DEPENDENCY ${filepath} + DEPENDS ${CMAKE_SOURCE_DIR}/inc/C_asm_shared.inc ${filepath} symcryptasm_shared.cppasm + COMMENT "C preprocessing ${filepath} to ${outformat} (${output_pass2})" + VERBATIM) + elseif(outformat STREQUAL masm) + # assume masm => MSVC C compiler + add_custom_command( + OUTPUT ${output_pass2} + COMMAND "${CMAKE_C_COMPILER}" /EP /P /Fi${output_pass2} ${filepath} + -I${CMAKE_CURRENT_SOURCE_DIR} -I${CMAKE_CURRENT_SOURCE_DIR}/${rootpath} -I${CMAKE_SOURCE_DIR}/inc + -DSYMCRYPT_${outformatupper} -DSYMCRYPT_CPU_${archdefineupper} ${dbg_definition} + MAIN_DEPENDENCY ${filepath} + DEPENDS ${CMAKE_SOURCE_DIR}/inc/C_asm_shared.inc ${filepath} symcryptasm_shared.cppasm + COMMENT "C preprocessing ${filepath} to ${outformat} (${output_pass2})" + VERBATIM) + endif() +endfunction() + +function(process_symcryptasm filepath outformat archdefine) + get_filename_component(fileextension ${filepath} EXT) + if(NOT fileextension STREQUAL .symcryptasm) + message(FATAL_ERROR "symcryptasm processing invoked on file with incorrect extension (${filepath} -> ${fileextension})") + endif() + if((NOT outformat STREQUAL gas) AND (NOT outformat STREQUAL masm)) + message(FATAL_ERROR "symcryptasm processing invoked with unrecognized outformat (${outformat})") + endif() + get_filename_component(rootpath ${filepath} DIRECTORY) + get_filename_component(filestem ${filepath} NAME_WE) # filestem is the filename w/out extension + set(filepath ${CMAKE_CURRENT_SOURCE_DIR}/${filepath}) + set(output_directory ${CMAKE_CURRENT_BINARY_DIR}/${rootpath}) + set(output_cppasm ${output_directory}/${filestem}-${outformat}.cppasm) + + add_custom_command( + OUTPUT ${output_cppasm} + COMMAND ${CMAKE_COMMAND} -E make_directory ${output_directory} + COMMAND python3 ${CMAKE_SOURCE_DIR}/scripts/symcryptasm_processor.py ${outformat} ${filepath} ${output_cppasm} + MAIN_DEPENDENCY ${filepath} + DEPENDS ${CMAKE_SOURCE_DIR}/scripts/symcryptasm_processor.py + COMMENT "Python preprocessing ${filepath} to ${outformat} (${output_cppasm})" + VERBATIM) + + process_cppasm(${output_cppasm} ${outformat} ${archdefine}) +endfunction() + if(NOT WIN32) list(APPEND SOURCES_COMMON linux/intrinsics.c) - list(APPEND SOURCES_COMMON linux/asmstubs.c) endif() if(WIN32 AND NOT(SYMCRYPT_TARGET_ENV MATCHES "Generic")) if(CMAKE_SYSTEM_PROCESSOR MATCHES "AMD64") + process_symcryptasm(amd64/aesasm.symcryptasm masm amd64) + process_symcryptasm(amd64/fdef_asm.symcryptasm masm amd64) + process_symcryptasm(amd64/fdef369_asm.symcryptasm masm amd64) + process_symcryptasm(amd64/fdef_mulx.symcryptasm masm amd64) + process_symcryptasm(amd64/wipe.symcryptasm masm amd64) + list(APPEND SOURCES_COMMON - amd64/aesasm.asm - 
amd64/fdef_asm.asm - amd64/fdef_mulx.asm - amd64/fdef369_asm.asm - amd64/sha1asm.asm - amd64/wipe.asm) + amd64/aesasm-masm.asm + amd64/fdef_asm-masm.asm + amd64/fdef369_asm-masm.asm + amd64/fdef_mulx-masm.asm + amd64/wipe-masm.asm) set_source_files_properties( - amd64/aesasm.asm - amd64/fdef_asm.asm - amd64/fdef_mulx.asm - amd64/fdef369_asm.asm - amd64/sha1asm.asm - amd64/wipe.asm + amd64/aesasm-masm.asm + amd64/fdef_asm-masm.asm + amd64/fdef369_asm-masm.asm + amd64/fdef_mulx-masm.asm + amd64/wipe-masm.asm PROPERTY LANGUAGE ASM_MASM) + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "X86") + process_cppasm(i386/fdef_asm.cppasm masm x86) + list(APPEND SOURCES_COMMON i386/aesasm.asm - i386/fdef_asm.asm - i386/rc4asm.asm - i386/sha1asm.asm + i386/fdef_asm-masm.asm i386/wipe.asm) set_source_files_properties( i386/aesasm.asm - i386/fdef_asm.asm - i386/rc4asm.asm - i386/sha1asm.asm + i386/fdef_asm-masm.asm i386/wipe.asm PROPERTY LANGUAGE ASM_MASM) + set_source_files_properties( + i386/fdef_asm-masm.asm PROPERTIES INCLUDE_DIRECTORIES ${CMAKE_CURRENT_SOURCE_DIR}/i386) + endif() +else() + if(CMAKE_SYSTEM_PROCESSOR MATCHES "AMD64") + process_symcryptasm(amd64/aesasm.symcryptasm gas amd64) + process_symcryptasm(amd64/fdef_asm.symcryptasm gas amd64) + process_symcryptasm(amd64/fdef369_asm.symcryptasm gas amd64) + process_symcryptasm(amd64/fdef_mulx.symcryptasm gas amd64) + process_symcryptasm(amd64/wipe.symcryptasm gas amd64) + + list(APPEND SOURCES_COMMON + amd64/aesasm-gas.asm + amd64/fdef_asm-gas.asm + amd64/fdef369_asm-gas.asm + amd64/fdef_mulx-gas.asm + amd64/wipe-gas.asm) + set_source_files_properties( + amd64/aesasm-gas.asm + amd64/fdef_asm-gas.asm + amd64/fdef369_asm-gas.asm + amd64/fdef_mulx-gas.asm + amd64/wipe-gas.asm + PROPERTY LANGUAGE ASM) endif() endif() diff --git a/lib/a_dispatch.c b/lib/a_dispatch.c index de4448e..a5db79d 100644 --- a/lib/a_dispatch.c +++ b/lib/a_dispatch.c @@ -22,7 +22,7 @@ const SYMCRYPT_MODULAR_FUNCTIONS g_SymCryptModFns[] = { SYMCRYPT_MOD_FUNCTIONS_FDEF_GENERIC, // Handles any type of modulus SYMCRYPT_MOD_FUNCTIONS_FDEF_MONTGOMERY, // Montgomery, only for odd parity-public moduli -#if SYMCRYPT_CPU_AMD64 && SYMCRYPT_MS_VC +#if SYMCRYPT_CPU_AMD64 SYMCRYPT_MOD_FUNCTIONS_FDEF369_MONTGOMERY, // optimized for 384 and 576-bit moduli SYMCRYPT_MOD_FUNCTIONS_FDEF_MONTGOMERY256, // Special faster code for 256-bit Montgomery moduli @@ -55,12 +55,12 @@ const UINT32 g_SymCryptModFnsMask = sizeof( g_SymCryptModFns ) - sizeof( g_SymCr // // Tweaking the selection & function tables allows different tradeoffs of performance vs codesize // -SYMCRYPT_MODULUS_TYPE_SELECTION_ENTRY SymCryptModulusTypeSelections[] = +SYMCRYPT_MODULUS_TYPE_SELECTION_ENTRY SymCryptModulusTypeSelections[] = { -#if SYMCRYPT_CPU_AMD64 && SYMCRYPT_MS_VC +#if SYMCRYPT_CPU_AMD64 // Mulx used for 257-512 and 577-... 
bits {('2M' << 16) + SymCryptModFntableMontgomery256, 0, 256, SYMCRYPT_MODULUS_FEATURE_MONTGOMERY }, - {('xM' << 16) + SymCryptModFntableMontgomeryMulx, SYMCRYPT_CPU_FEATURES_FOR_MULX, 512, SYMCRYPT_MODULUS_FEATURE_MONTGOMERY }, + {('xM' << 16) + SymCryptModFntableMontgomeryMulx, SYMCRYPT_CPU_FEATURES_FOR_MULX, 512, SYMCRYPT_MODULUS_FEATURE_MONTGOMERY }, {('9M' << 16) + SymCryptModFntable369Montgomery, 0, 384, SYMCRYPT_MODULUS_FEATURE_MONTGOMERY }, {('5M' << 16) + SymCryptModFntableMontgomery512, 0, 512, SYMCRYPT_MODULUS_FEATURE_MONTGOMERY }, {('9M' << 16) + SymCryptModFntable369Montgomery, 0, 576, SYMCRYPT_MODULUS_FEATURE_MONTGOMERY }, @@ -118,9 +118,9 @@ SymCryptSizeofIntFromDigits( UINT32 nDigits ) PSYMCRYPT_INT SYMCRYPT_CALL -SymCryptIntCreate( - _Out_writes_bytes_( cbBuffer ) PBYTE pbBuffer, - SIZE_T cbBuffer, +SymCryptIntCreate( + _Out_writes_bytes_( cbBuffer ) PBYTE pbBuffer, + SIZE_T cbBuffer, UINT32 nDigits ) { return SymCryptFdefIntCreate( pbBuffer, cbBuffer, nDigits ); @@ -138,8 +138,8 @@ SymCryptIntWipe( _Out_ PSYMCRYPT_INT piDst ) VOID SYMCRYPT_CALL -SymCryptIntCopy( - _In_ PCSYMCRYPT_INT piSrc, +SymCryptIntCopy( + _In_ PCSYMCRYPT_INT piSrc, _Out_ PSYMCRYPT_INT piDst ) { SymCryptFdefIntCopy( piSrc, piDst ); @@ -191,8 +191,8 @@ SymCryptIntDigitsizeOfObject( _In_ PCSYMCRYPT_INT piSrc ) SYMCRYPT_ERROR SYMCRYPT_CALL -SymCryptIntCopyMixedSize( - _In_ PCSYMCRYPT_INT piSrc, +SymCryptIntCopyMixedSize( + _In_ PCSYMCRYPT_INT piSrc, _Out_ PSYMCRYPT_INT piDst ) { return SymCryptFdefIntCopyMixedSize( piSrc, piDst ); @@ -207,8 +207,8 @@ SymCryptIntBitsizeOfValue( _In_ PCSYMCRYPT_INT piSrc ) VOID SYMCRYPT_CALL -SymCryptIntSetValueUint32( - UINT32 u32Src, +SymCryptIntSetValueUint32( + UINT32 u32Src, _Out_ PSYMCRYPT_INT piDst ) { SymCryptFdefIntSetValueUint32( u32Src, piDst ); @@ -216,8 +216,8 @@ SymCryptIntSetValueUint32( VOID SYMCRYPT_CALL -SymCryptIntSetValueUint64( - UINT64 u64Src, +SymCryptIntSetValueUint64( + UINT64 u64Src, _Out_ PSYMCRYPT_INT piDst ) { SymCryptFdefIntSetValueUint64( u64Src, piDst ); @@ -225,10 +225,10 @@ SymCryptIntSetValueUint64( SYMCRYPT_ERROR SYMCRYPT_CALL -SymCryptIntSetValue( - _In_reads_bytes_(cbSrc) PCBYTE pbSrc, - SIZE_T cbSrc, - SYMCRYPT_NUMBER_FORMAT format, +SymCryptIntSetValue( + _In_reads_bytes_(cbSrc) PCBYTE pbSrc, + SIZE_T cbSrc, + SYMCRYPT_NUMBER_FORMAT format, _Out_ PSYMCRYPT_INT piDst ) { return SymCryptFdefIntSetValue( pbSrc, cbSrc, format, piDst ); @@ -236,10 +236,10 @@ SymCryptIntSetValue( SYMCRYPT_ERROR SYMCRYPT_CALL -SymCryptIntGetValue( - _In_ PCSYMCRYPT_INT piSrc, - _Out_writes_bytes_( cbDst ) PBYTE pbDst, - SIZE_T cbDst, +SymCryptIntGetValue( + _In_ PCSYMCRYPT_INT piSrc, + _Out_writes_bytes_( cbDst ) PBYTE pbDst, + SIZE_T cbDst, SYMCRYPT_NUMBER_FORMAT format ) { return SymCryptFdefIntGetValue( piSrc, pbDst, cbDst, format ); @@ -496,9 +496,9 @@ SymCryptSizeofDivisorFromDigits( UINT32 nDigits ) PSYMCRYPT_DIVISOR SYMCRYPT_CALL -SymCryptDivisorCreate( - _Out_writes_bytes_( cbBuffer ) PBYTE pbBuffer, - SIZE_T cbBuffer, +SymCryptDivisorCreate( + _Out_writes_bytes_( cbBuffer ) PBYTE pbBuffer, + SIZE_T cbBuffer, UINT32 nDigits ) { return SymCryptFdefDivisorCreate( pbBuffer, cbBuffer, nDigits ); @@ -514,8 +514,8 @@ SymCryptDivisorWipe( _Out_ PSYMCRYPT_DIVISOR pdObj ) } VOID -SymCryptDivisorCopy( - _In_ PCSYMCRYPT_DIVISOR pdSrc, +SymCryptDivisorCopy( + _In_ PCSYMCRYPT_DIVISOR pdSrc, _Out_ PSYMCRYPT_DIVISOR pdDst ) { SymCryptFdefDivisorCopy( pdSrc, pdDst ); @@ -585,9 +585,9 @@ SymCryptSizeofModulusFromDigits( UINT32 nDigits ) PSYMCRYPT_MODULUS 
SYMCRYPT_CALL -SymCryptModulusCreate( - _Out_writes_bytes_( cbBuffer ) PBYTE pbBuffer, - SIZE_T cbBuffer, +SymCryptModulusCreate( + _Out_writes_bytes_( cbBuffer ) PBYTE pbBuffer, + SIZE_T cbBuffer, UINT32 nDigits ) { return SymCryptFdefModulusCreate( pbBuffer, cbBuffer, nDigits ); @@ -604,7 +604,7 @@ SymCryptModulusWipe( _Out_ PSYMCRYPT_MODULUS pmObj ) VOID SymCryptModulusCopy( - _In_ PCSYMCRYPT_MODULUS pmSrc, + _In_ PCSYMCRYPT_MODULUS pmSrc, _Out_ PSYMCRYPT_MODULUS pmDst ) { SymCryptFdefModulusCopy( pmSrc, pmDst ); @@ -626,8 +626,8 @@ SymCryptModElementAllocate( _In_ PCSYMCRYPT_MODULUS pmMod ) VOID SYMCRYPT_CALL -SymCryptModElementFree( - _In_ PCSYMCRYPT_MODULUS pmMod, +SymCryptModElementFree( + _In_ PCSYMCRYPT_MODULUS pmMod, _Out_ PSYMCRYPT_MODELEMENT peObj ) { SymCryptFdefModElementFree( pmMod, peObj ); @@ -642,9 +642,9 @@ SymCryptSizeofModElementFromModulus( PCSYMCRYPT_MODULUS pmMod ) PSYMCRYPT_MODELEMENT SYMCRYPT_CALL -SymCryptModElementCreate( - _Out_writes_bytes_( cbBuffer ) PBYTE pbBuffer, - SIZE_T cbBuffer, +SymCryptModElementCreate( + _Out_writes_bytes_( cbBuffer ) PBYTE pbBuffer, + SIZE_T cbBuffer, PCSYMCRYPT_MODULUS pmMod ) { return SymCryptFdefModElementCreate( pbBuffer, cbBuffer, pmMod ); @@ -660,9 +660,9 @@ SymCryptModElementWipe( } VOID -SymCryptModElementCopy( +SymCryptModElementCopy( _In_ PCSYMCRYPT_MODULUS pmMod, - _In_ PCSYMCRYPT_MODELEMENT peSrc, + _In_ PCSYMCRYPT_MODELEMENT peSrc, _Out_ PSYMCRYPT_MODELEMENT peDst ) { SymCryptFdefModElementCopy( pmMod, peSrc, peDst ); @@ -671,7 +671,7 @@ SymCryptModElementCopy( VOID SymCryptModElementMaskedCopy( _In_ PCSYMCRYPT_MODULUS pmMod, - _In_ PCSYMCRYPT_MODELEMENT peSrc, + _In_ PCSYMCRYPT_MODELEMENT peSrc, _Out_ PSYMCRYPT_MODELEMENT peDst, UINT32 mask ) { @@ -753,7 +753,7 @@ SymCryptModElementToInt( PCUINT32 pData; SYMCRYPT_ASSERT( piDst->nDigits >= pmMod->nDigits ); - + pData = SYMCRYPT_MOD_CALL( pmMod ) modPreGet( pmMod, peSrc, pbScratch, cbScratch ); SymCryptFdefModElementToIntGeneric( pmMod, pData, piDst, pbScratch, cbScratch ); @@ -762,17 +762,17 @@ SymCryptModElementToInt( SYMCRYPT_DISABLE_CFG SYMCRYPT_ERROR SYMCRYPT_CALL -SymCryptModElementSetValue( - _In_reads_bytes_( cbSrc ) PCBYTE pbSrc, - SIZE_T cbSrc, - SYMCRYPT_NUMBER_FORMAT format, +SymCryptModElementSetValue( + _In_reads_bytes_( cbSrc ) PCBYTE pbSrc, + SIZE_T cbSrc, + SYMCRYPT_NUMBER_FORMAT format, PCSYMCRYPT_MODULUS pmMod, _Out_ PSYMCRYPT_MODELEMENT peDst, _Out_writes_bytes_( cbScratch ) PBYTE pbScratch, SIZE_T cbScratch ) { SYMCRYPT_ERROR scError; - + scError = SymCryptFdefModElementSetValueGeneric( pbSrc, cbSrc, format, pmMod, peDst, pbScratch, cbScratch ); if( scError == SYMCRYPT_NO_ERROR ) @@ -785,11 +785,11 @@ SymCryptModElementSetValue( SYMCRYPT_ERROR SYMCRYPT_CALL -SymCryptModElementGetValue( +SymCryptModElementGetValue( PCSYMCRYPT_MODULUS pmMod, _In_ PCSYMCRYPT_MODELEMENT peSrc, - _Out_writes_bytes_( cbDst ) PBYTE pbDst, - SIZE_T cbDst, + _Out_writes_bytes_( cbDst ) PBYTE pbDst, + SIZE_T cbDst, SYMCRYPT_NUMBER_FORMAT format, _Out_writes_bytes_( cbScratch ) PBYTE pbScratch, SIZE_T cbScratch ) @@ -889,8 +889,8 @@ SymCryptModNeg( SYMCRYPT_DISABLE_CFG VOID SYMCRYPT_CALL -SymCryptModElementSetValueUint32( - UINT32 value, +SymCryptModElementSetValueUint32( + UINT32 value, _In_ PCSYMCRYPT_MODULUS pmMod, _Out_ PSYMCRYPT_MODELEMENT peDst, _Out_writes_bytes_( cbScratch ) PBYTE pbScratch, @@ -903,8 +903,8 @@ SymCryptModElementSetValueUint32( VOID SYMCRYPT_CALL -SymCryptModElementSetValueNegUint32( - UINT32 value, +SymCryptModElementSetValueNegUint32( + UINT32 value, 
_In_ PCSYMCRYPT_MODULUS pmMod, _Out_ PSYMCRYPT_MODELEMENT peDst, _Out_writes_bytes_( cbScratch ) PBYTE pbScratch, @@ -994,7 +994,7 @@ SymCryptCreateTrialDivisionContext( UINT32 nDigits ) UINT32 SYMCRYPT_CALL -SymCryptIntFindSmallDivisor( +SymCryptIntFindSmallDivisor( _In_ PCSYMCRYPT_TRIALDIVISION_CONTEXT pContext, _In_ PCSYMCRYPT_INT piSrc, _Out_writes_bytes_( cbScratch ) PBYTE pbScratch, diff --git a/lib/amd64/aesasm.asm b/lib/amd64/aesasm.asm deleted file mode 100644 index dbb820a..0000000 --- a/lib/amd64/aesasm.asm +++ /dev/null @@ -1,1657 +0,0 @@ -; -; AesAsm.asm Assembler code for fast AES on the amd64 -; -; Copyright (c) Microsoft Corporation. Licensed under the MIT license. -; -; This code is derived from the AesFast implemenation that -; Niels Ferguson wrote from scratch for BitLocker during Vista. -; That code is still in RSA32. -; - -include ksamd64.inc - -include symcrypt_version.inc -include symcrypt_magic.inc - - TITLE "Advanced Encryption Standard (AES)" - -USE_BLOCK_FUNCTION EQU 1 ; Set to 1 to use block function, 0 to use block macro - -; -; Structure definition that mirrors the SYMCRYPT_AES_EXPANDED_KEY structure. -; - -N_ROUND_KEYS_IN_AESKEY EQU 29 - -SYMCRYPT_AES_EXPANDED_KEY struct - RoundKey dq 2*N_ROUND_KEYS_IN_AESKEY dup (?) ; - lastEncRoundKey dq ? ; pointer to last enc round key - lastDecRoundKey dq ? ; pointer to last dec round key - - SYMCRYPT_MAGIC_FIELD - -SYMCRYPT_AES_EXPANDED_KEY ends - - - extern SymCryptAesSboxMatrixMult:DWORD - extern SymCryptAesInvSboxMatrixMult:DWORD -; extern SymCryptAesSbox:BYTE ; Not used - extern SymCryptAesInvSbox:BYTE - -; -; Shorthand for the 4 tables we will use -; We always use r11 to point to the (inv) SboxMatrixMult tables -; -SMM0 EQU r11 -SMM1 EQU r11 + 0400h -SMM2 EQU r11 + 0800h -SMM3 EQU r11 + 0c00h - -ISMM0 EQU r11 -ISMM1 EQU r11 + 0400h -ISMM2 EQU r11 + 0800h -ISMM3 EQU r11 + 0c00h - - - - -ENC_MIX MACRO keyptr - ; - ; Perform the unkeyed mixing function for encryption - ; plus a key addition from the key pointer - ; - ; input:block is in eax, ebx, ecx, edx; r11 points to AesSboxMatrixMult - ; New state ends up in eax, ebx, ecx, edx - ; Used registers: esi, edi, ebp, r8 - - ; - ; We can use the e registers for the movzx as the - ; upper 32 bits are automatically set to 0. This saves - ; prefix bytes - ; - ; We use 32-bit registers to store the state. - ; We tried using 64-bit registers, but the extra shifts - ; cost too much. - ; Using 32-bit throughout makes the key xor more expensive - ; but we avoid having to combine the 32-bit halves into - ; 64 bit. 
- ; - - movzx esi,al - mov esi,[SMM0 + 4 * rsi] - movzx edi,ah - shr eax,16 - mov r8d,[SMM1 + 4 * rdi] - movzx ebp,al - mov ebp,[SMM2 + 4 * rbp] - movzx edi,ah - mov edi,[SMM3 + 4 * rdi] - - movzx eax,bl - xor edi,[SMM0 + 4 * rax] - movzx eax,bh - shr ebx,16 - xor esi,[SMM1 + 4 * rax] - movzx eax,bl - xor r8d,[SMM2 + 4 * rax] - movzx eax,bh - xor ebp,[SMM3 + 4 * rax] - - movzx eax,cl - xor ebp,[SMM0 + 4 * rax] - movzx ebx,ch - shr ecx,16 - xor edi,[SMM1 + 4 * rbx] - movzx eax,cl - xor esi,[SMM2 + 4 * rax] - movzx ebx,ch - xor r8d,[SMM3 + 4 * rbx] - - movzx eax,dl - xor r8d,[SMM0 + 4 * rax] - movzx ebx,dh - shr edx,16 - xor ebp,[SMM1 + 4 * rbx] - movzx eax,dl - xor edi,[SMM2 + 4 * rax] - movzx ebx,dh - xor esi,[SMM3 + 4 * rbx] - - mov eax, [keyptr] - mov ebx, [keyptr + 4] - xor eax, esi - mov ecx, [keyptr + 8] - xor ebx, edi - mov edx, [keyptr + 12] - xor ecx, ebp - xor edx, r8d - - ENDM - - -DEC_MIX MACRO keyptr - ; - ; Perform the unkeyed mixing function for decryption - ; - ; input:block is in eax, ebx, ecx, edx - ; r11 points to AesInvSboxMatrixMult - ; New state ends up in esi, edi, ebp, r8d - - movzx esi,al - mov esi,[ISMM0 + 4 * rsi] - movzx edi,ah - shr eax,16 - mov edi,[ISMM1 + 4 * rdi] - movzx ebp,al - mov ebp,[ISMM2 + 4 * rbp] - movzx eax,ah - mov r8d,[ISMM3 + 4 * rax] - - movzx eax,bl - xor edi,[ISMM0 + 4 * rax] - movzx eax,bh - shr ebx,16 - xor ebp,[ISMM1 + 4 * rax] - movzx eax,bl - xor r8d,[ISMM2 + 4 * rax] - movzx eax,bh - xor esi,[ISMM3 + 4 * rax] - - movzx eax,cl - xor ebp,[ISMM0 + 4 * rax] - movzx ebx,ch - shr ecx,16 - xor r8d,[ISMM1 + 4 * rbx] - movzx eax,cl - xor esi,[ISMM2 + 4 * rax] - movzx ebx,ch - xor edi,[ISMM3 + 4 * rbx] - - movzx eax,dl - xor r8d,[ISMM0 + 4 * rax] - movzx ebx,dh - shr edx,16 - xor esi,[ISMM1 + 4 * rbx] - movzx eax,dl - xor edi,[ISMM2 + 4 * rax] - movzx ebx,dh - xor ebp,[ISMM3 + 4 * rbx] - - mov eax, [keyptr] - mov ebx, [keyptr + 4] - xor eax, esi - mov ecx, [keyptr + 8] - xor ebx, edi - mov edx, [keyptr + 12] - xor ecx, ebp - xor edx, r8d - - ENDM - - - -AES_ENCRYPT_MACRO MACRO - ; - ; Plaintext in eax, ebx, ecx, edx - ; r9 points to first round key to use (modified) - ; r10 is last key to use (unchanged) - ; r11 points to SboxMatrixMult (unchanged) - ; Ciphertext ends up in esi, edi, ebp, r8d - ; - ; This macro is free to unroll the cipher completely, or to use a loop - ; over r9 - ; - - ; - ; xor in first round key - ; - xor eax,[r9] - xor ebx,[r9+4] - xor ecx,[r9+8] - xor edx,[r9+12] - - ENC_MIX r9+16 - - ENC_MIX r9+32 - - ENC_MIX r9+48 - - ENC_MIX r9+64 - - ENC_MIX r9+80 - - ENC_MIX r9+96 - - add r9,160 - - ENC_MIX r9-48 - - ;align 16 - -@@: - ; Block is eax, ebx, ecx, edx - ; r9-16 points to next round key - - ENC_MIX r9-32 - - ENC_MIX r9-16 - - cmp r9,r10 - lea r9,[r9+32] - jc @B - - ; - ; Now for the final round - ; We use the fact that SboxMatrixMult[0] table is also - ; an Sbox table if you use the second element of each entry. 
- ; - ; Result is in esi, edi, ebp, r8d - ; - - movzx esi,al - movzx esi,byte ptr[r11 + 1 + 4*rsi] - movzx edi,ah - shr eax,16 - movzx r8d,byte ptr[r11 + 1 + 4*rdi] - movzx ebp,al - shl r8d,8 - movzx ebp,byte ptr[r11 + 1 + 4*rbp] - shl ebp,16 - movzx edi,ah - movzx edi,byte ptr[r11 + 1 + 4*rdi] - shl edi,24 - - movzx eax,bl - movzx eax,byte ptr[r11 + 1 + 4*rax] - or edi,eax - movzx eax,bh - shr ebx,16 - movzx eax,byte ptr[r11 + 1 + 4*rax] - shl eax,8 - or esi,eax - movzx eax,bl - movzx eax,byte ptr[r11 + 1 + 4*rax] - movzx ebx,bh - shl eax,16 - movzx ebx,byte ptr[r11 + 1 + 4*rbx] - or r8d,eax - shl ebx,24 - or ebp,ebx - - movzx eax,cl - movzx ebx,ch - movzx eax,byte ptr[r11 + 1 + 4*rax] - shr ecx,16 - movzx ebx,byte ptr[r11 + 1 + 4*rbx] - shl ebx,8 - or ebp,eax - or edi,ebx - movzx eax,cl - movzx eax,byte ptr[r11 + 1 + 4*rax] - movzx ebx,ch - movzx ebx,byte ptr[r11 + 1 + 4*rbx] - shl eax,16 - shl ebx,24 - or esi,eax - or r8d,ebx - - movzx eax,dl - movzx ebx,dh - movzx eax,byte ptr[r11 + 1 + 4*rax] - shr edx,16 - movzx ebx,byte ptr[r11 + 1 + 4*rbx] - shl ebx,8 - or r8d,eax - or ebp,ebx - movzx eax,dl - movzx eax,byte ptr[r11 + 1 + 4*rax] - movzx ebx,dh - movzx ebx,byte ptr[r11 + 1 + 4*rbx] - shl eax,16 - shl ebx,24 - or edi,eax - or esi,ebx - - ; - ; xor in final round key - ; - - xor r8d,[r10+12] - xor esi,[r10] - xor edi,[r10+4] - xor ebp,[r10+8] - - ENDM - -AES_DECRYPT_MACRO MACRO - ; - ; Ciphertext in eax, ebx, ecx, edx - ; r9 points to first round key to use - ; r10 is last key to use (unchanged) - ; r11 points to InvSboxMatrixMult (unchanged) - ; r12 points to InvSbox (unchanged) - ; Ciphertext ends up in esi, edi, ebp, r8d - ; - - - ; - ; xor in first round key - ; - xor eax,[r9] - xor ebx,[r9+4] - xor ecx,[r9+8] - xor edx,[r9+12] - - DEC_MIX r9+16 - - DEC_MIX r9+32 - - DEC_MIX r9+48 - - DEC_MIX r9+64 - - DEC_MIX r9+80 - - DEC_MIX r9+96 - - add r9,160 - - DEC_MIX r9-48 - - ;align 16 - -@@: - ; Block is eax, ebx, ecx, edx - ; r9-32 points to next round key - - DEC_MIX r9-32 - - DEC_MIX r9-16 - - cmp r9,r10 - lea r9,[r9+32] - jc @B - - ; - ; Now for the final round - ; Result is in esi, edi, ebp, r8d - ; - - movzx esi,al - movzx esi,byte ptr[r12 + rsi] - movzx edi,ah - shr eax,16 - movzx edi,byte ptr[r12 + rdi] - movzx ebp,al - shl edi,8 - movzx ebp,byte ptr[r12 + rbp] - shl ebp,16 - movzx eax,ah - movzx r8d,byte ptr[r12 + rax] - shl r8d,24 - - movzx eax,bl - movzx eax,byte ptr[r12 + rax] - or edi,eax - movzx eax,bh - shr ebx,16 - movzx eax,byte ptr[r12 + rax] - shl eax,8 - or ebp,eax - movzx eax,bl - movzx eax,byte ptr[r12 + rax] - movzx ebx,bh - shl eax,16 - movzx ebx,byte ptr[r12 + rbx] - or r8d,eax - shl ebx,24 - or esi,ebx - - movzx eax,cl - movzx ebx,ch - movzx eax,byte ptr[r12 + rax] - shr ecx,16 - movzx ebx,byte ptr[r12 + rbx] - shl ebx,8 - or ebp,eax - or r8d,ebx - movzx eax,cl - movzx eax,byte ptr[r12 + rax] - movzx ebx,ch - movzx ebx,byte ptr[r12 + rbx] - shl eax,16 - shl ebx,24 - or esi,eax - or edi,ebx - - movzx eax,dl - movzx ebx,dh - movzx eax,byte ptr[r12 + rax] - shr edx,16 - movzx ebx,byte ptr[r12 + rbx] - shl ebx,8 - or r8d,eax - or esi,ebx - movzx eax,dl - movzx eax,byte ptr[r12 + rax] - movzx ebx,dh - movzx ebx,byte ptr[r12 + rbx] - shl eax,16 - shl ebx,24 - or edi,eax - or ebp,ebx - - ; - ; xor in final round key - ; - - xor esi,[r10] - xor edi,[r10+4] - xor ebp,[r10+8] - xor r8d,[r10+12] - - ENDM - -if 0 -AES_ENCRYPT_XMM MACRO - ; xmm0 contains the plaintext - ; rcx points to first round key to use - ; r10 is last key to use (unchanged) - ; Ciphertext ends up in 
xmm0 - ; - - ; - ; xor in first round key; round keys are 16-aligned on amd64 - ; - pxor xmm0,[rcx] - aesenc xmm0,[rcx+16] - - aesenc xmm0,[rcx+32] - aesenc xmm0,[rcx+48] - aesenc xmm0,[rcx+64] - aesenc xmm0,[rcx+80] - aesenc xmm0,[rcx+96] - aesenc xmm0,[rcx+112] - add rcx, 128 - -@@: - ; r9 points to next round key - - aesenc xmm0, [rcx] - aesenc xmm0, [rcx+16] - - add rcx, 32 - cmp rcx,r10 - jc @B - - ; - ; Now for the final round - ; - aesenclast xmm0, [r10] - - ENDM - - -AES_DECRYPT_XMM MACRO - ; xmm0 contains the ciphertext - ; rcx points to first round key to use - ; r10 is last key to use (unchanged) - ; Plaintext ends up in xmm0 - ; - - ; - ; xor in first round key; round keys are 16-aligned on amd64 - ; - pxor xmm0,[rcx] - aesdec xmm0,[rcx+16] - - aesdec xmm0,[rcx+32] - aesdec xmm0,[rcx+48] - aesdec xmm0,[rcx+64] - aesdec xmm0,[rcx+80] - aesdec xmm0,[rcx+96] - aesdec xmm0,[rcx+112] - add rcx, 128 - -@@: - ; r9 points to next round key - - aesdec xmm0, [rcx] - aesdec xmm0, [rcx+16] - - add rcx, 32 - cmp rcx,r10 - jc @B - - ; - ; Now for the final round - ; - aesdeclast xmm0, [r10] - - - ENDM -endif - - IF USE_BLOCK_FUNCTION - - ; - ; We use a block function, the AES_ENCRYPT macro merely calls the function - ; - -AES_ENCRYPT MACRO - call SymCryptAesEncryptAsmInternal - ENDM - -AES_DECRYPT MACRO - call SYmCryptAesDecryptAsmInternal - ENDM - -;======================================== -; SymCryptAesEncryptAsmInternal -; -; Internal AES encryption routine with modified calling convention. -; This function has the exact same calling convention as the AES_ENCRYPT_MACRO - - - LEAF_ENTRY SymCryptAesEncryptAsmInternal, _TEXT - - AES_ENCRYPT_MACRO - - ret - - LEAF_END SymCryptAesEncryptAsmInternal, _TEXT - - -;======================================== -; SymCryptAesDecryptAsmInternal -; -; Internal AES encryption routine with modified calling convention. -; This function has the exact same calling convention as the AES_DECRYPT_MACRO -; - - - LEAF_ENTRY SymCryptAesDecryptAsmInternal, _TEXT - - AES_DECRYPT_MACRO - - ret - - LEAF_END SymCryptAesDecryptAsmInternal, _TEXT - - - ELSE - - ; - ; No block function, use the macro directly - ; - -AES_ENCRYPT MACRO - AES_ENCRYPT_MACRO - ENDM - -AES_DECRYPT MACRO - AES_DECRYPT_MACRO - ENDM - - ENDIF - - - -; -;VOID -;SYMCRYPT_CALL -;SymCryptAesEncrypt( _In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey, -; _In_reads_bytes_( SYMCRYPT_AES_BLOCK_LEN ) PCBYTE pbPlaintext, -; _Out_writes_bytes_( SYMCRYPT_AES_BLOCK_LEN ) PBYTE pbCiphertext ); -; - -SymCryptAesEncryptFrame struct - -SaveRdi dq ? -SaveRsi dq ? -SaveRbp dq ? -SaveRbx dq ? -ReturnAddress dq ? -CallerP1Home dq ? -CallerP2Home dq ? -CallerP3Home dq ? -CallerP4Home dq ? - -SymCryptAesEncryptFrame ends - - NESTED_ENTRY SymCryptAesEncryptAsm, _TEXT - - ; - ; Prologue - ; Pushes are as fast as stores and smaller, so we use those - ; - rex_push_reg rbx - push_reg rbp - push_reg rsi - push_reg rdi - END_PROLOGUE - - SYMCRYPT_CHECK_MAGIC rcx, SYMCRYPT_AES_EXPANDED_KEY - - ; - ; At this point the stack is not properly aligned, but as we only call our own internal function - ; with a modified calling convention this is not a problem. (Interrupt routines can deal with - ; unaligned stack, and the stack _will_ be aligned during the actual AES work.) 
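The SYMCRYPT_CHECK_MAGIC invocation above is a DBG-only integrity check on the expanded key; in the old file the macro comes from symcrypt_magic.inc, and it is redefined in the new aesasm.symcryptasm later in this diff. A rough C rendering of the check it performs, with hypothetical names and a simplified layout:

```c
#include <stdint.h>

/* Rough C rendering of the DBG-only SYMCRYPT_CHECK_MAGIC sequence
   (mov rax,[ptr+magic]; sub rax,ptr; cmp rax,constant): the magic field
   stores constant + address of the structure, so a key that was bytewise
   copied to another address, or corrupted, fails the check.
   EXAMPLE_* names and layout are illustrative only. */
typedef struct {
    uint8_t   roundKeys[29 * 16];
    void     *lastEncRoundKey;
    void     *lastDecRoundKey;
    uintptr_t magic;              /* MAGIC_CONSTANT + (uintptr_t)&structure */
} EXAMPLE_AES_EXPANDED_KEY;

static int exampleMagicOk(const EXAMPLE_AES_EXPANDED_KEY *pKey,
                          uintptr_t magicConstant)
{
    return (pKey->magic - (uintptr_t)pKey) == magicConstant;
}
```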
- ; - - - ; Parameters passed: - ; rcx = pExpandedKey - ; rdx = pbPlaintext - ; r8 = pbCiphertext - ; - - mov r10, [rcx + SYMCRYPT_AES_EXPANDED_KEY.lastEncRoundKey] - mov r9, rcx - - mov [rsp + SymCryptAesEncryptFrame.CallerP3Home], r8 - - ; - ; Load the plaintext - ; - mov eax,[rdx ] - mov ebx,[rdx + 4] - mov ecx,[rdx + 8] - mov edx,[rdx + 12] - - lea r11,[SymCryptAesSboxMatrixMult] - - AES_ENCRYPT - ; - ; Plaintext in eax, ebx, ecx, edx - ; r9 points to first round key to use - ; r10 is last key to use (unchanged) - ; r11 points to SboxMatrixMult (unchanged) - ; Ciphertext ends up in esi, edi, ebp, r8d - ; - - mov rdx,[rsp + SymCryptAesEncryptFrame.CallerP3Home] - mov [rdx ], esi - mov [rdx + 4], edi - mov [rdx + 8], ebp - mov [rdx + 12], r8d - -SymCryptAesEncryptAsmDone: - - BEGIN_EPILOGUE - - pop rdi - pop rsi - pop rbp - pop rbx - ret - - - NESTED_END SymCryptAesEncryptAsm, _TEXT - - -; -;VOID -;SYMCRYPT_CALL -;SymCryptAesDecrypt( _In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey, -; _In_reads_bytes_( SYMCRYPT_AES_BLOCK_LEN ) PCBYTE pbCiphertext, -; _Out_writes_bytes_( SYMCRYPT_AES_BLOCK_LEN ) PBYTE pbPlaintext ); -; - - - NESTED_ENTRY SymCryptAesDecryptAsm, _TEXT - -SymCryptAesDecryptFrame struct - -SaveR12 dq ? -SaveRdi dq ? -SaveRsi dq ? -SaveRbp dq ? -SaveRbx dq ? -ReturnAddress dq ? -pExpandedKeyHome dq ? -pbCiphertextHome dq ? -pbPlaintextHome dq ? -CallerP4Home dq ? - -SymCryptAesDecryptFrame ends - ; - ; Prologue - ; Pushes are as fast as stores and smaller, so we use those - ; - rex_push_reg rbx - push_reg rbp - push_reg rsi - push_reg rdi - push_reg r12 - END_PROLOGUE - - SYMCRYPT_CHECK_MAGIC rcx, SYMCRYPT_AES_EXPANDED_KEY - - ; - ; At this point the stack is not properly aligned, but as we only call our own internal function - ; with a modified calling convention this is not a problem. (Interrupt routines can deal with - ; unaligned stack, and the stack _will_ be aligned during the actual AES work.) - ; - - - ; Parameters passed: - ; rcx = pExpandedKey - ; rdx = pbCiphertext - ; r8 = pbPlaintext - ; - - mov r9,[rcx + SYMCRYPT_AES_EXPANDED_KEY.lastEncRoundKey] - mov r10,[rcx + SYMCRYPT_AES_EXPANDED_KEY.lastDecRoundKey] - - mov [rsp + SymCryptAesDecryptFrame.pbCiphertextHome], r8 - - mov eax,[rdx] - mov ebx,[rdx+4] - mov ecx,[rdx+8] - mov edx,[rdx+12] - - - lea r11,[SymCryptAesInvSboxMatrixMult] - lea r12,[SymCryptAesInvSbox] - - AES_DECRYPT - ; Ciphertext in eax, ebx, ecx, edx - ; r9 points to first round key to use - ; r10 is last key to use (unchanged) - ; r11 points to InvSboxMatrixMult (unchanged) - ; r12 points to InvSbox (unchanged) - ; Ciphertext ends up in esi, edi, ebp, r8d - - mov rdx,[rsp + SymCryptAesDecryptFrame.pbCiphertextHome] ; retrieve bpPlaintext - mov [rdx],esi - mov [rdx+4],edi - mov [rdx+8],ebp - mov [rdx+12],r8d - -SymCryptAesDecryptAsmDone: - - BEGIN_EPILOGUE - - pop r12 - pop rdi - pop rsi - pop rbp - pop rbx - ret - - NESTED_END SymCryptAesDecryptAsm, _TEXT - - - -;VOID -;SYMCRYPT_CALL -;SymCryptAesCbcEncrypt( -; _In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey, -; _In_reads_bytes_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbChainingValue, -; _In_reads_bytes_( cbData ) PCBYTE pbSrc, -; _Out_writes_bytes_( cbData ) PBYTE pbDst, -; SIZE_T cbData ); - - NESTED_ENTRY SymCryptAesCbcEncryptAsm, _TEXT - -AesCbcEncryptFrame struct - -SaveR15 dq ? -SaveR14 dq ? -SaveR13 dq ? -SaveR12 dq ? -SaveRdi dq ? -SaveRsi dq ? -SaveRbp dq ? -SaveRbx dq ? -ReturnAddress dq ? -CallerP1Home dq ? -CallerP2Home dq ? -CallerP3Home dq ? -CallerP4Home dq ? -cbData dq ? 
- -AesCbcEncryptFrame ends - - ; - ; rcx = pExpandedKey - ; rdx = pbChainingValue - ; r8 = pbSrc - ; r9 = pbDst - ; [rsp+28] = cbData - - rex_push_reg rbx - push_reg rbp - push_reg rsi - push_reg rdi - push_reg r12 - push_reg r13 - push_reg r14 - push_reg r15 - - END_PROLOGUE - - SYMCRYPT_CHECK_MAGIC rcx, SYMCRYPT_AES_EXPANDED_KEY - - mov r15,[rsp + AesCbcEncryptFrame.cbData] - - mov [rsp + AesCbcEncryptFrame.CallerP2Home], rdx ; save pbChainingValue - - mov r13, r8 ; r13 = pbSrc - - - and r15, NOT 15 - jz SymCryptAesCbcEncryptNoData - - add r15, r8 - - mov r14, r9 ; r14 = pbDst - - mov r10,[rcx + SYMCRYPT_AES_EXPANDED_KEY.lastEncRoundKey] ; r10 = last enc round key - - ; - ; Load the chaining state from pbChainingValue - ; - mov esi,[rdx] - mov edi,[rdx+4] - mov ebp,[rdx+8] - mov r8d,[rdx+12] - - - mov r12,rcx ; r12 = first round key to use - - lea r11,[SymCryptAesSboxMatrixMult] - - - align 16 -SymCryptAesCbcEncryptAsmLoop: - ; Loop register setup - ; r10 = last round key to use - ; r12 = first round key to use - ; r13 = pbSrc - ; r14 = pbDst - ; r15 = pbSrcEnd - - ; chaining state in (esi,edi,ebp,r8d) - - mov eax, [r13] - mov r9, r12 - mov ebx, [r13+4] - xor eax, esi - mov ecx, [r13+8] - xor ebx, edi - xor ecx, ebp - mov edx, [r13+12] - xor edx, r8d - - add r13, 16 - - - AES_ENCRYPT - ; - ; Plaintext in eax, ebx, ecx, edx - ; r9 points to first round key to use - ; r10 is last key to use (unchanged) - ; r11 points to SboxMatrixMult (unchanged) - ; Ciphertext ends up in esi, edi, ebp, r8d - ; - - mov [r14], esi - mov [r14+4], edi - mov [r14+8], ebp - mov [r14+12], r8d - - add r14, 16 - - cmp r13, r15 - - jb SymCryptAesCbcEncryptAsmLoop - - - ; - ; Update the chaining value - ; - mov rdx,[rsp + AesCbcEncryptFrame.CallerP2Home] - mov [rdx], esi - mov [rdx+4], edi - mov [rdx+8], ebp - mov [rdx+12], r8d - -SymCryptAesCbcEncryptNoData: -SymCryptAesCbcEncryptDone: - - BEGIN_EPILOGUE - - pop r15 - pop r14 - pop r13 - pop r12 - pop rdi - pop rsi - pop rbp - pop rbx - ret - - NESTED_END SymCryptAesCbcEncryptAsm, _TEXT - - - -;VOID -;SYMCRYPT_CALL -;SymCryptAesCbcDecrypt( -; _In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey, -; _In_reads_bytes_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbChainingValue, -; _In_reads_bytes_( cbData ) PCBYTE pbSrc, -; _Out_writes_bytes_( cbData ) PBYTE pbDst, -; SIZE_T cbData ); - - NESTED_ENTRY SymCryptAesCbcDecryptAsm, _TEXT - -AesCbcDecryptFrame struct - -SaveR15 dq ? -SaveR14 dq ? -SaveR13 dq ? -SaveR12 dq ? -SaveRdi dq ? -SaveRsi dq ? -SaveRbp dq ? -SaveRbx dq ? -ReturnAddress dq ? -CallerP1Home dq ? ;Tmp1 -CallerP2Home dq ? ;pbChainingValue -CallerP3Home dq ? ;pbSrc -CallerP4Home dq ? ;Tmp2 -cbData dq ? 
- -AesCbcDecryptFrame ends - - ; - ; rcx = pExpandedKey - ; rdx = pbChainingValue - ; r8 = pbSrc - ; r9 = pbDst - ; [rsp+28] = cbData - - rex_push_reg rbx - push_reg rbp - push_reg rsi - push_reg rdi - push_reg r12 - push_reg r13 - push_reg r14 - push_reg r15 - - END_PROLOGUE - - SYMCRYPT_CHECK_MAGIC rcx, SYMCRYPT_AES_EXPANDED_KEY - - mov r14,[rsp + AesCbcDecryptFrame.cbData] - - and r14, NOT 15 - jz SymCryptAesCbcDecryptNoData - - mov r13,[rcx + SYMCRYPT_AES_EXPANDED_KEY.lastEncRoundKey] - mov r10,[rcx + SYMCRYPT_AES_EXPANDED_KEY.lastDecRoundKey] - - mov [rsp + AesCbcDecryptFrame.CallerP2Home], rdx ;pbChainingValue - mov [rsp + AesCbcDecryptFrame.CallerP3Home], r8 ;pbSrc - sub r14, 16 - - lea r15,[r9 + r14] ; r15 = pbDst pointed to last block - add r14, r8 ; r14 = pbSrc pointed to last block - - lea r11,[SymCryptAesInvSboxMatrixMult] - lea r12,[SymCryptAesInvSbox] - - ; - ; Load last ciphertext block & save on stack (we need to put it in the pbChaining buffer later) - ; - mov eax,[r14] - mov ebx,[r14+4] - mov ecx,[r14+8] - mov edx,[r14+12] - - mov dword ptr [rsp + AesCbcDecryptFrame.CallerP1Home], eax - mov dword ptr [rsp + AesCbcDecryptFrame.CallerP1Home+4], ebx - mov dword ptr [rsp + AesCbcDecryptFrame.CallerP4Home], ecx - mov dword ptr [rsp + AesCbcDecryptFrame.CallerP4Home+4], edx - - jmp SymCryptAesCbcDecryptAsmLoopEntry - - align 16 - -SymCryptAesCbcDecryptAsmLoop: - ; Loop register setup - ; r13 = first round key to use - ; r14 = pbSrc - ; r15 = pbDst - ; [callerP3Home] = pbSrcStart - - ; current ciphertext block (esi,edi,ebp,r8d) - - mov eax,[r14-16] - mov ebx,[r14-12] - xor esi,eax - mov ecx,[r14-8] - xor edi,ebx - mov [r15],esi - mov edx,[r14-4] - xor ebp,ecx - mov [r15+4],edi - xor r8d,edx - mov [r15+8],ebp - mov [r15+12],r8d - - sub r14,16 - sub r15,16 - -SymCryptAesCbcDecryptAsmLoopEntry: - - mov r9, r13 - - AES_DECRYPT - ; - ; Ciphertext in eax, ebx, ecx, edx - ; r9 points to first round key to use - ; r10 is last key to use (unchanged) - ; r11 points to InvSboxMatrixMult (unchanged) - ; r12 points to InvSbox (unchanged) - ; Ciphertext ends up in esi, edi, ebp, r8d - ; - - cmp r14, [rsp + AesCbcDecryptFrame.CallerP3Home] ; pbSrc - ja SymCryptAesCbcDecryptAsmLoop - - mov rbx,[rsp + AesCbcDecryptFrame.CallerP2Home] ; pbChainingValue - xor esi,[rbx] - xor edi,[rbx+4] - xor ebp,[rbx+8] - xor r8d,[rbx+12] - - mov [r15], esi - mov [r15+4], edi - mov [r15+8], ebp - mov [r15+12], r8d - - ; - ; Update the chaining value to the last ciphertext block - ; - mov rax,[rsp + AesCbcDecryptFrame.CallerP1Home] - mov rcx,[rsp + AesCbcDecryptFrame.CallerP4Home] - mov [rbx], rax - mov [rbx+8], rcx - -SymCryptAesCbcDecryptNoData: - - BEGIN_EPILOGUE - - pop r15 - pop r14 - pop r13 - pop r12 - pop rdi - pop rsi - pop rbp - pop rbx - ret - - NESTED_END SymCryptAesCbcDecryptAsm, _TEXT - - - -;VOID -;SYMCRYPT_CALL -;SymCryptAesCtrMsb64( -; _In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey, -; _In_reads_bytes_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbChainingValue, -; _In_reads_bytes_( cbData ) PCBYTE pbSrc, -; _Out_writes_bytes_( cbData ) PBYTE pbDst, -; SIZE_T cbData ); - - NESTED_ENTRY SymCryptAesCtrMsb64Asm, _TEXT - -AesCtrMsb64Frame struct - -SaveR15 dq ? -SaveR14 dq ? -SaveR13 dq ? -SaveR12 dq ? -SaveRdi dq ? -SaveRsi dq ? -SaveRbp dq ? -SaveRbx dq ? -ReturnAddress dq ? -CallerP1Home dq ? -CallerP2Home dq ? -CallerP3Home dq ? ; used to store the first half of the chaining state -CallerP4Home dq ? ; used to store the second half of the chaining state -cbData dq ? 
- -AesCtrMsb64Frame ends - - ; - ; rcx = pExpandedKey - ; rdx = pbChainingValue - ; r8 = pbSrc - ; r9 = pbDst - ; [rsp+28] = cbData - - rex_push_reg rbx - push_reg rbp - push_reg rsi - push_reg rdi - push_reg r12 - push_reg r13 - push_reg r14 - push_reg r15 - - END_PROLOGUE - - SYMCRYPT_CHECK_MAGIC rcx, SYMCRYPT_AES_EXPANDED_KEY - - mov r14,[rsp + AesCtrMsb64Frame.cbData] - and r14, NOT 15 ; only deal with whole # blocks - jz SymCryptAesCtrMsb64NoData - - add r14, r8 ; cbData + pbSrc = pbSrcEnd - - mov [rsp + AesCtrMsb64Frame.CallerP2Home], rdx ; save pbChainingState - mov r12, rcx ; r12 = first round key to use - mov r10,[rcx + SYMCRYPT_AES_EXPANDED_KEY.lastEncRoundKey] ; r10 = last enc round key - - mov r13, r8 ; pbSrc - mov r15, r9 ; pbDst - - lea r11,[SymCryptAesSboxMatrixMult] - - ; - ; Load the chaining state - ; - mov rax, [rdx + 0] - mov rcx, [rdx + 8] - - ; - ; Store it in our local copy (we have no register free to keep pbChainingState in) - ; - mov [rsp + AesCtrMsb64Frame.CallerP3Home + 0], rax - mov [rsp + AesCtrMSb64Frame.CallerP3Home + 8], rcx - - ; - ; Move to the right registers - ; - mov rbx, rax - mov rdx, rcx - shr rbx, 32 - shr rdx, 32 - - align 16 -SymCryptAesCtrMsb64AsmLoop: - ; Loop invariant - ; Current chaining state is in (eax, ebx, ecx, edx) - ; r10 = last round key to use - ; r11 = SboxMatrixMult - ; r12 = first round key to use - ; r13 = pbSrc - ; r14 = pbSrcEnd - ; r15 = pbDst - ; [rsp + CallerP3Home] = 16 bytes chaining state block - - mov r9, r12 - - AES_ENCRYPT - ; - ; Plaintext in eax, ebx, ecx, edx - ; r9 points to first round key to use - ; r10 is last key to use (unchanged) - ; r11 points to SboxMatrixMult (unchanged) - ; Ciphertext ends up in esi, edi, ebp, r8d - ; - - ; To improve latency, we FIRST - ; load the chaining state, increment the counter, and write it back. - ; leave the state in the (eax, ebx, ecx, edx) registers - - mov eax,dword ptr [rsp + AesCtrMsb64Frame.CallerP3Home + 0] - mov ebx,dword ptr [rsp + AesCtrMsb64Frame.CallerP3Home + 4] - mov rcx,[rsp + AesCtrMsb64Frame.CallerP3Home + 8 ] - bswap rcx - add rcx, 1 - bswap rcx - mov [rsp + AesCtrMsb64Frame.CallerP3Home + 8], rcx - mov rdx, rcx - shr rdx, 32 - - ; THEN we process the XOR of the key stream with the data - ; This order is faster as we need to have the chaining state done - ; before we can proceed, but there are no dependencies on the data result - ; So we can loop back to the beginning while the data stream read/writes are - ; still in flight. 
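The bswap/add/bswap sequence above increments the big-endian 64-bit counter kept in the last 8 bytes of the chaining value. A portable C sketch of the same operation (the function name is illustrative, not SymCrypt's):

```c
#include <stdint.h>

/* Increment the big-endian (MSB-first) 64-bit counter in bytes 8..15 of the
   16-byte chaining value, as the bswap/add/bswap sequence above does. */
static void exampleCtrMsb64Increment(uint8_t chainingValue[16])
{
    uint64_t c = 0;
    for (int i = 8; i < 16; i++) {
        c = (c << 8) | chainingValue[i];    /* load, byte-reversing */
    }
    c++;                                    /* the 'add rcx, 1' */
    for (int i = 15; i >= 8; i--) {
        chainingValue[i] = (uint8_t)c;      /* store, byte-reversing back */
        c >>= 8;
    }
}
```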
- ; - ; xor with the source stream - - xor esi,[r13 + 0 ] - xor edi,[r13 + 4 ] - xor ebp,[r13 + 8 ] - xor r8d,[r13 + 12] - - ; store at the destination - - mov [r15 + 0], esi - mov [r15 + 4], edi - mov [r15 + 8], ebp - mov [r15 + 12], r8d - - add r13, 16 ; pbSrc += 16 - add r15, 16 ; pbDst += 16 - - cmp r13, r14 - - jb SymCryptAesCtrMsb64AsmLoop - - ; - ; Copy back the chaining value; we only modified the last 8 bytes, so that is all we copy - ; - mov rsi,[rsp + AesCtrMsb64Frame.CallerP2Home] ; pbChainingState - mov [rsi + 8], ecx - mov [rsi + 12], edx - - ; - ; Wipe the chaining value on stack - ; - xor rax, rax - mov [rsp + AesCtrMsb64Frame.CallerP3Home], rax - mov [rsp + AesCtrMsb64Frame.CallerP4Home], rax - -SymCryptAesCtrMsb64NoData: - - BEGIN_EPILOGUE - - pop r15 - pop r14 - pop r13 - pop r12 - pop rdi - pop rsi - pop rbp - pop rbx - ret - - NESTED_END SymCryptAesCtrMsb64Asm, _TEXT - - -if 0 - LEAF_ENTRY SymCryptAesEncryptXmm, _TEXT - ; - ; rcx = expanded key - ; rdx = pbSrc - ; r8 = pbDst - - SYMCRYPT_CHECK_MAGIC rcx, SYMCRYPT_AES_EXPANDED_KEY - - movups xmm0,[rdx] - mov r10, [rcx + SYMCRYPT_AES_EXPANDED_KEY.lastEncRoundKey] - - - AES_ENCRYPT_XMM - ; xmm0 contains the plaintext - ; rcx points to first round key to use - ; r10 is last key to use (unchanged) - - movups [r8],xmm0 - - ret - - LEAF_END SymCryptAesEncryptXmm, _TEXT -endif - -if 0 - - LEAF_ENTRY SymCryptAesDecryptXmm, _TEXT - ; - ; rcx = expanded key - ; rdx = pbSrc - ; r8 = pbDst - - SYMCRYPT_CHECK_MAGIC rcx, SYMCRYPT_AES_EXPANDED_KEY - - movups xmm0,[rdx] - mov r10, [rcx + SYMCRYPT_AES_EXPANDED_KEY.lastDecRoundKey] - mov rcx, [rcx + SYMCRYPT_AES_EXPANDED_KEY.lastEncRoundKey] - - - AES_DECRYPT_XMM - ; xmm0 contains the plaintext - ; rcx points to first round key to use - ; r10 is last key to use (unchanged) - - movups [r8],xmm0 - - ret - - LEAF_END SymCryptAesDecryptXmm, _TEXT -endif - -if 0 - - LEAF_ENTRY SymCryptAesCbcEncryptXmm, _TEXT -;VOID -;SYMCRYPT_CALL -;SymCryptAesCbcEncrypt( -; _In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey, -; _In_reads_bytes_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbChainingValue, -; _In_reads_bytes_( cbData ) PCBYTE pbSrc, -; _Out_writes_bytes_( cbData ) PBYTE pbDst, -; SIZE_T cbData ); - -SymCryptAesCbcEncryptXmmFrame struct - -ReturnAddress dq ? -CallerP1Home dq ? -CallerP2Home dq ? -CallerP3Home dq ? -CallerP4Home dq ? -cbData dq ? - -SymCryptAesCbcEncryptXmmFrame ends - - ; rcx = expanded key - ; rdx = pbChainingValue - ; r8 = pbSrc - ; r9 = pbDst - - SYMCRYPT_CHECK_MAGIC rcx, SYMCRYPT_AES_EXPANDED_KEY - - mov rax,[rsp + SymCryptAesCbcEncryptXmmFrame.cbData] - mov r11,rcx ; first round key - and rax, NOT 15 - jz SymCryptAesCbcEncryptXmmDone - - ; [rsp + 40] = cbData - - mov r10, [rcx + SYMCRYPT_AES_EXPANDED_KEY.lastEncRoundKey] - - add rax, r8 ; rax = pbSrcEnd - - movups xmm0,[rdx] - -SymCryptAesCbcEncryptAsmXmmLoop: - movups xmm1,[r8] - add r8,16 - - pxor xmm0,xmm1 - - mov rcx, r11 - - AES_ENCRYPT_XMM - ; xmm0 contains the plaintext - ; rcx points to first round key to use - ; r10 is last key to use (unchanged) - ; Ciphertext ends up in xmm0 - - movups [r9],xmm0 - add r9, 16 - cmp r8, rax - jb SymCryptAesCbcEncryptAsmXmmLoop - - movups [rdx],xmm0 - -SymCryptAesCbcEncryptXmmDone: - - ret - - LEAF_END SymCryptAesCbcEncryptXmm, _TEXT - -endif - -if 0 ; Replaced with C code using intrinics. 
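The C replacement this `if 0` comment refers to uses AES-NI intrinsics. A single-block sketch of the aesdec pattern the disabled routines below implement (simplified signature and key layout, not SymCrypt's actual C code):

```c
#include <wmmintrin.h>  /* AES-NI intrinsics; build with -maes on GCC/Clang */

/* Single-block sketch of the aesdec pattern used below: xor in the first
   round key, aesdec with each middle key, aesdeclast with the last key.
   The key layout and signature are simplified for illustration. */
static __m128i exampleAesDecryptBlock(const __m128i *pFirstKey,
                                      const __m128i *pLastKey,
                                      __m128i block)
{
    block = _mm_xor_si128(block, *pFirstKey++);
    while (pFirstKey < pLastKey) {
        block = _mm_aesdec_si128(block, *pFirstKey++);
    }
    return _mm_aesdeclast_si128(block, *pLastKey);
}
```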
- - LEAF_ENTRY SymCryptAesDecryptXmm4, _TEXT - ; decrypt xmm0-3 - ; Registers used: xmm4 - ; rcx = first key, r10 = last key - ; rcx is destroyed - - movaps xmm4,[rcx] - lea rcx, [rcx+16] - pxor xmm0, xmm4 - pxor xmm1, xmm4 - pxor xmm2, xmm4 - pxor xmm3, xmm4 - -@@: movaps xmm4,[rcx] - add rcx,16 - aesdec xmm0, xmm4 - aesdec xmm1, xmm4 - aesdec xmm2, xmm4 - aesdec xmm3, xmm4 - - cmp rcx, r10 - jc @B - - movaps xmm4,[r10] - - aesdeclast xmm0, xmm4 - aesdeclast xmm1, xmm4 - aesdeclast xmm2, xmm4 - aesdeclast xmm3, xmm4 - - ret - - LEAF_END SymCryptAesDecryptXmm4, _TEXT - - - - NESTED_ENTRY SymCryptAesCbcDecryptXmm, _TEXT -;VOID -;SYMCRYPT_CALL -;SymCryptAesCbcDecrypt( -; _In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey, -; _In_reads_bytes_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbChainingValue, -; _In_reads_bytes_( cbData ) PCBYTE pbSrc, -; _Out_writes_bytes_( cbData ) PBYTE pbDst, -; SIZE_T cbData ) - -SymCryptAesCbcDecryptXmmFrame struct - -SaveXmm9 dq 2 dup (?) -SaveXmm8 dq 2 dup (?) -SaveXmm7 dq 2 dup (?) -SaveXmm6 dq 2 dup (?) -SaveRbx dq ? -ReturnAddress dq ? -CallerP1Home dq ? -CallerP2Home dq ? -CallerP3Home dq ? -CallerP4Home dq ? -cbData dq ? - -SymCryptAesCbcDecryptXmmFrame ends - - - rex_push_reg rbx - alloc_stack SymCryptAesCbcDecryptXmmFrame.SaveRbx - save_xmm128 xmm6, oword ptr SymCryptAesCbcDecryptXmmFrame.SaveXmm6 - save_xmm128 xmm7, oword ptr SymCryptAesCbcDecryptXmmFrame.SaveXmm7 - save_xmm128 xmm8, oword ptr SymCryptAesCbcDecryptXmmFrame.SaveXmm8 - save_xmm128 xmm9, oword ptr SymCryptAesCbcDecryptXmmFrame.SaveXmm9 - - END_PROLOGUE - - SYMCRYPT_CHECK_MAGIC rcx, SYMCRYPT_AES_EXPANDED_KEY - - ; rcx = key - ; rdx = chaining value - ; r8 = pbSrc - ; r9 = pbDst - ; [rsp + cbData] = cbData - - mov rbx,[rsp + SymCryptAesCbcDecryptXmmFrame.cbData] - and rbx, NOT 15 - jz SymCryptAesCbcDecryptXmmNoData - - - xor rax, rax ; offset into buffers - - mov r10, [rcx + SYMCRYPT_AES_EXPANDED_KEY.lastDecRoundKey] - mov r11, [rcx + SYMCRYPT_AES_EXPANDED_KEY.lastEncRoundKey] - - movups xmm5, [rdx] ; load IV - - - sub rbx, 64 ; cbData - 64 - jc SymCryptAesCbcDecryptXmmPartial - -SymCryptAesCbcDecryptXmm4Loop: - ; - ; xmm5 = IV - ; r8 = pbSrc - ; r9 = pbDst - ; rax = offset into buffer; we will process bytes rax..rax+63 in this iteration - ; rbx = cbData - 64 - ; rax <= rbx - ; - - movups xmm0,[r8 + rax] - movups xmm1,[r8 + rax + 16] - movaps xmm6, xmm0 - movups xmm2,[r8 + rax + 32] - movaps xmm7, xmm1 - movups xmm3,[r8 + rax + 48] - movaps xmm8, xmm2 - movaps xmm9, xmm3 - - mov rcx, r11 - call SymCryptAesDecryptXmm4 ; decrypt xmm0-3 using xmm4. 
rcx = first key, r10 = last key - - pxor xmm0, xmm5 - pxor xmm1, xmm6 - movups [r9 + rax], xmm0 - pxor xmm2, xmm7 - movups [r9 + rax + 16], xmm1 - pxor xmm3, xmm8 - movups [r9 + rax + 32], xmm2 - movups [r9 + rax + 48], xmm3 - - movaps xmm5, xmm9 - - add rax,64 - cmp rax,rbx - jbe SymCryptAesCbcDecryptXmm4Loop - - test rbx,63 - jz SymCryptAesCbcDecryptXmmDone ; cbData was a multiple of 64, no partial block - - sub rbx, rax ; rbx = bytes left - 64 - -SymCryptAesCbcDecryptXmmPartial: - ; - ; r8 = pbSrc - ; r9 = pbDst - ; rax = current offset - ; rbx = # bytes left to do - 64, # bytes left is nonzero - ; - - movups xmm0,[r8+rax] - movaps xmm6,xmm0 - cmp rbx,16 - 64 - jz SymCryptAesCbcDecryptXmmPartialLoadDone - - movups xmm1,[r8+rax+16] - movaps xmm7,xmm1 - cmp rbx,32 - 64 - jz SymCRyptAesCbcDecryptXmmPartialLoadDone - - movups xmm2,[r8+rax+32] - movaps xmm8, xmm2 - -SymCryptAesCbcDecryptXmmPartialLoadDone: - - mov rcx,r11 - call SymCryptAesDecryptXmm4 - - pxor xmm0, xmm5 - movups [r9 + rax], xmm0 - movaps xmm5, xmm6 - cmp rbx, 16 - 64 - jz SymCryptAesCbcDecryptXmmDone - - pxor xmm1, xmm6 - movups [r9 + rax + 16], xmm1 - movaps xmm5, xmm7 - cmp rbx, 32 - 64 - jz SymCryptAesCbcDecryptXmmDone - - pxor xmm2, xmm7 - movups [r9 + rax + 32], xmm2 - movaps xmm5, xmm8 - -SymCryptAesCbcDecryptXmmDone: - movups [rdx], xmm5 - -SymCryptAesCbcDecryptXmmNoData: - - movaps xmm6, oword ptr [rsp + SymCryptAesCbcDecryptXmmFrame.SaveXmm6] - movaps xmm7, oword ptr [rsp + SymCryptAesCbcDecryptXmmFrame.SaveXmm7] - movaps xmm8, oword ptr [rsp + SymCryptAesCbcDecryptXmmFrame.SaveXmm8] - movaps xmm9, oword ptr [rsp + SymCryptAesCbcDecryptXmmFrame.SaveXmm9] - - add rsp,SymCryptAesCbcDecryptXmmFrame.SaveRbx - - BEGIN_EPILOGUE - - pop rbx - ret - - NESTED_END SymCryptAesCbcDecryptXmm, _TEXT - -endif - -if 0 ; No longer used; replaced with C code using intrinsics that can be inlined. -; -;VOID -;SymCryptAes4SboxXmm( _In_reads_bytes_(4) PCBYTE pIn, _Out_writes_bytes_(4) PBYTE pOut ); -; - LEAF_ENTRY SymCryptAes4SboxXmm, _TEXT - ; - ;rcx points to source - ;rdx points to destination - ; - ;We only use volatile registers so we do not have to save any registers. - ; - - mov eax,[rcx] ; Use a register to avoid alignment issues - movd xmm0, eax - - movsldup xmm0, xmm0 ; copy [31:0] to [63:32] - aeskeygenassist xmm0, xmm0, 0 - - movd eax, xmm0 - mov [rdx], eax - - ret - - LEAF_END SymCryptAes4SboxXmm, _TEXT - - -; -;VOID -;AesCreateDecryptionRoundKeyXmm( _In_reads_bytes_(16) PCBYTE pEncryptionRoundKey, -; _Out_writes_bytes_(16) PBYTE pDecryptionRoundKey ); -; - LEAF_ENTRY SymCryptAesCreateDecryptionRoundKeyXmm, _TEXT - ;rcx points to source - ;rdx points to destination - - movups xmm0,[rcx] - aesimc xmm0, xmm0 - movups [rdx], xmm0 - ret - - LEAF_END SymCryptAesCreateDecryptionRoundKeyXmm, _TEXT - -endif - - end - diff --git a/lib/amd64/aesasm.symcryptasm b/lib/amd64/aesasm.symcryptasm new file mode 100644 index 0000000..7c7eded --- /dev/null +++ b/lib/amd64/aesasm.symcryptasm @@ -0,0 +1,964 @@ +// +// aesasm.symcryptasm Assembler code for fast AES on the amd64 +// Expresses asm in a generic enough way to enable generation of MASM and GAS using the +// symcryptasm_processor.py script and C preprocessor +// +// Copyright (c) Microsoft Corporation. Licensed under the MIT license. +// +// This code is derived from the AesFast implementation that +// Niels Ferguson wrote from scratch for BitLocker during Vista. +// That code is still in RSA32. 
+// + +// This file has only been partially translated into symcryptasm; external function calls use the +// generic symcryptasm registers to map different calling conventions onto the fixed register +// layout used in aesasm. It seems likely that changing which registers the AES state is kept in within +// the macros could impact performance. +// In general we don't want to touch this code going forward; the vast majority of amd64 CPUs have aesni +// and use the Xmm Aes codepaths. + +#include "symcryptasm_shared.cppasm" + +#include "symcrypt_version.inc" + +#define USE_BLOCK_FUNCTION 1 // Set to 1 to use block function, 0 to use block macro + +#if defined(SYMCRYPT_MASM) +extern SymCryptAesSboxMatrixMult:DWORD +extern SymCryptAesInvSboxMatrixMult:DWORD +extern SymCryptAesInvSbox:BYTE +extern SymCryptFatal:NEAR + +#elif defined(SYMCRYPT_GAS) + +#else +#error Unknown target assembly +#endif + +#if DBG +SET(SYMCRYPT_CODE_VERSION, ((SYMCRYPT_CODE_VERSION_API SHL 16) OR SYMCRYPT_CODE_VERSION_MINOR )) +SET(SYMCRYPT_MAGIC_CONSTANT, (HEX(53316D76) + SYMCRYPT_CODE_VERSION)) // 0x53316D76 == 'S1mv' + +MACRO_START(SYMCRYPT_CHECK_MAGIC, check_magic_label, ptr, struct_magic_offset, arg_1) + mov rax, [ptr + struct_magic_offset] + sub rax, ptr + cmp rax, SYMCRYPT_MAGIC_CONSTANT + jz check_magic_label + mov arg_1, HEX(6D616763) // 0x6D616763 == 'magc' + call SymCryptFatal +check_magic_label: +MACRO_END() +#else +MACRO_START(SYMCRYPT_CHECK_MAGIC, check_magic_label, ptr, struct_magic_offset, arg_1) +MACRO_END() +#endif + +// +// Structure definition that mirrors the SYMCRYPT_AES_EXPANDED_KEY structure. +// + +// SYMCRYPT_AES_EXPANDED_KEY struct +// RoundKey dq 2*N_ROUND_KEYS_IN_AESKEY dup (?) // +// lastEncRoundKey dq ? // pointer to last enc round key +// lastDecRoundKey dq ? // pointer to last dec round key +// SYMCRYPT_MAGIC_FIELD +// SYMCRYPT_AES_EXPANDED_KEY ends + +SET(N_ROUND_KEYS_IN_AESKEY, 29) +SET(lastEncRoundKeyOffset, (29*16)) +SET(lastDecRoundKeyOffset, (29*16 + 8)) +SET(magicFieldOffset, (29*16 + 8 + 8)) + +// +// Shorthand for the 4 tables we will use +// We always use r11 to point to the (inv) SboxMatrixMult tables +// +#define SMM0 (r11 + 0) +#define SMM1 (r11 + 1024) +#define SMM2 (r11 + 2048) +#define SMM3 (r11 + 3072) + +#define ISMM0 (r11 + 0) +#define ISMM1 (r11 + 1024) +#define ISMM2 (r11 + 2048) +#define ISMM3 (r11 + 3072) + +MACRO_START(ENC_MIX, keyptr) + // + // Perform the unkeyed mixing function for encryption + // plus a key addition from the key pointer + // + // input:block is in eax, ebx, ecx, edx - r11 points to AesSboxMatrixMult + // New state ends up in eax, ebx, ecx, edx + // Used registers: esi, edi, ebp, r8 + + // + // We can use the e registers for the movzx as the + // upper 32 bits are automatically set to 0. This saves + // prefix bytes + // + // We use 32-bit registers to store the state. + // We tried using 64-bit registers, but the extra shifts + // cost too much. + // Using 32-bit throughout makes the key xor more expensive + // but we avoid having to combine the 32-bit halves into + // 64 bit.
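In C terms, the ENC_MIX macro computes one keyed T-table AES round. A hedged sketch, with the four 1KB SboxMatrixMult tables abstracted as the parameter T and s[0..3] standing in for the state kept in eax, ebx, ecx, edx (illustrative names, not SymCrypt code):

```c
#include <stdint.h>

/* One keyed T-table AES round, as ENC_MIX computes it. T abstracts the four
   1KB SboxMatrixMult tables; s[0..3] is the state kept in eax,ebx,ecx,edx. */
static void exampleEncMix(const uint32_t T[4][256], uint32_t s[4],
                          const uint32_t k[4])
{
    uint32_t n0, n1, n2, n3;

    n0 = T[0][s[0] & 0xff] ^ T[1][(s[1] >> 8) & 0xff] ^
         T[2][(s[2] >> 16) & 0xff] ^ T[3][s[3] >> 24];
    n1 = T[0][s[1] & 0xff] ^ T[1][(s[2] >> 8) & 0xff] ^
         T[2][(s[3] >> 16) & 0xff] ^ T[3][s[0] >> 24];
    n2 = T[0][s[2] & 0xff] ^ T[1][(s[3] >> 8) & 0xff] ^
         T[2][(s[0] >> 16) & 0xff] ^ T[3][s[1] >> 24];
    n3 = T[0][s[3] & 0xff] ^ T[1][(s[0] >> 8) & 0xff] ^
         T[2][(s[1] >> 16) & 0xff] ^ T[3][s[2] >> 24];

    s[0] = n0 ^ k[0];
    s[1] = n1 ^ k[1];
    s[2] = n2 ^ k[2];
    s[3] = n3 ^ k[3];
}
```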
+ //
+
+ movzx esi,al
+ mov esi,[SMM0 + 4 * rsi]
+ movzx edi,ah
+ shr eax,16
+ mov r8d,[SMM1 + 4 * rdi]
+ movzx ebp,al
+ mov ebp,[SMM2 + 4 * rbp]
+ movzx edi,ah
+ mov edi,[SMM3 + 4 * rdi]
+
+ movzx eax,bl
+ xor edi,[SMM0 + 4 * rax]
+ movzx eax,bh
+ shr ebx,16
+ xor esi,[SMM1 + 4 * rax]
+ movzx eax,bl
+ xor r8d,[SMM2 + 4 * rax]
+ movzx eax,bh
+ xor ebp,[SMM3 + 4 * rax]
+
+ movzx eax,cl
+ xor ebp,[SMM0 + 4 * rax]
+ movzx ebx,ch
+ shr ecx,16
+ xor edi,[SMM1 + 4 * rbx]
+ movzx eax,cl
+ xor esi,[SMM2 + 4 * rax]
+ movzx ebx,ch
+ xor r8d,[SMM3 + 4 * rbx]
+
+ movzx eax,dl
+ xor r8d,[SMM0 + 4 * rax]
+ movzx ebx,dh
+ shr edx,16
+ xor ebp,[SMM1 + 4 * rbx]
+ movzx eax,dl
+ xor edi,[SMM2 + 4 * rax]
+ movzx ebx,dh
+ xor esi,[SMM3 + 4 * rbx]
+
+ mov eax, [keyptr]
+ mov ebx, [keyptr + 4]
+ xor eax, esi
+ mov ecx, [keyptr + 8]
+ xor ebx, edi
+ mov edx, [keyptr + 12]
+ xor ecx, ebp
+ xor edx, r8d
+MACRO_END()
+
+
+MACRO_START(DEC_MIX, keyptr)
+ //
+ // Perform the unkeyed mixing function for decryption
+ // plus a key addition from the key pointer
+ //
+ // input: block is in eax, ebx, ecx, edx
+ // r11 points to AesInvSboxMatrixMult
+ // New state ends up in eax, ebx, ecx, edx
+ // Used registers: esi, edi, ebp, r8
+
+ movzx esi,al
+ mov esi,[ISMM0 + 4 * rsi]
+ movzx edi,ah
+ shr eax,16
+ mov edi,[ISMM1 + 4 * rdi]
+ movzx ebp,al
+ mov ebp,[ISMM2 + 4 * rbp]
+ movzx eax,ah
+ mov r8d,[ISMM3 + 4 * rax]
+
+ movzx eax,bl
+ xor edi,[ISMM0 + 4 * rax]
+ movzx eax,bh
+ shr ebx,16
+ xor ebp,[ISMM1 + 4 * rax]
+ movzx eax,bl
+ xor r8d,[ISMM2 + 4 * rax]
+ movzx eax,bh
+ xor esi,[ISMM3 + 4 * rax]
+
+ movzx eax,cl
+ xor ebp,[ISMM0 + 4 * rax]
+ movzx ebx,ch
+ shr ecx,16
+ xor r8d,[ISMM1 + 4 * rbx]
+ movzx eax,cl
+ xor esi,[ISMM2 + 4 * rax]
+ movzx ebx,ch
+ xor edi,[ISMM3 + 4 * rbx]
+
+ movzx eax,dl
+ xor r8d,[ISMM0 + 4 * rax]
+ movzx ebx,dh
+ shr edx,16
+ xor esi,[ISMM1 + 4 * rbx]
+ movzx eax,dl
+ xor edi,[ISMM2 + 4 * rax]
+ movzx ebx,dh
+ xor ebp,[ISMM3 + 4 * rbx]
+
+ mov eax, [keyptr]
+ mov ebx, [keyptr + 4]
+ xor eax, esi
+ mov ecx, [keyptr + 8]
+ xor ebx, edi
+ mov edx, [keyptr + 12]
+ xor ecx, ebp
+ xor edx, r8d
+MACRO_END()
+
+MACRO_START(AES_ENCRYPT_MACRO, AesEncryptMacroLoopLabel)
+ //
+ // Plaintext in eax, ebx, ecx, edx
+ // r9 points to first round key to use (modified)
+ // r10 is last key to use (unchanged)
+ // r11 points to SboxMatrixMult (unchanged)
+ // Ciphertext ends up in esi, edi, ebp, r8d
+ //
+ // This macro is free to unroll the cipher completely, or to use a loop
+ // over r9
+ //
+
+ //
+ // xor in first round key
+ //
+ xor eax,[r9]
+ xor ebx,[r9+4]
+ xor ecx,[r9+8]
+ xor edx,[r9+12]
+
+ add r9,32
+
+ // Do not unroll the loop at all because very few CPUs use this codepath so it's worth
+ // minimizing the binary size
+
+AesEncryptMacroLoopLabel:
+ // Block is eax, ebx, ecx, edx
+ // r9-16 points to next round key
+
+ ENC_MIX r9-16
+
+ cmp r9,r10
+ lea r9,[r9+16]
+ jc AesEncryptMacroLoopLabel
+
+ //
+ // Now for the final round
+ // We use the fact that SboxMatrixMult[0] table is also
+ // an Sbox table if you use the second element of each entry.
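[Editor's note] A hedged illustration of the trick described in the comment just above: on a little-endian machine, byte 1 of each 32-bit SboxMatrixMult entry is the plain S-box output, which is what the final round's 'movzx esi, byte ptr [r11 + 1 + 4*rsi]' loads. In C (T0 is a hypothetical stand-in for the first 1KB table):

    #include <stdint.h>

    extern const uint32_t T0[256];   // stands in for SymCryptAesSboxMatrixMult[0..255]

    static uint8_t SboxViaMatrixMult( uint8_t x )
    {
        return (uint8_t)( T0[x] >> 8 );   // the byte at offset 1 of entry x
    }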
+ // + // Result is in esi, edi, ebp, r8d + // + + movzx esi,al + movzx esi,byte ptr[r11 + 1 + 4*rsi] + movzx edi,ah + shr eax,16 + movzx r8d,byte ptr[r11 + 1 + 4*rdi] + movzx ebp,al + shl r8d,8 + movzx ebp,byte ptr[r11 + 1 + 4*rbp] + shl ebp,16 + movzx edi,ah + movzx edi,byte ptr[r11 + 1 + 4*rdi] + shl edi,24 + + movzx eax,bl + movzx eax,byte ptr[r11 + 1 + 4*rax] + or edi,eax + movzx eax,bh + shr ebx,16 + movzx eax,byte ptr[r11 + 1 + 4*rax] + shl eax,8 + or esi,eax + movzx eax,bl + movzx eax,byte ptr[r11 + 1 + 4*rax] + movzx ebx,bh + shl eax,16 + movzx ebx,byte ptr[r11 + 1 + 4*rbx] + or r8d,eax + shl ebx,24 + or ebp,ebx + + movzx eax,cl + movzx ebx,ch + movzx eax,byte ptr[r11 + 1 + 4*rax] + shr ecx,16 + movzx ebx,byte ptr[r11 + 1 + 4*rbx] + shl ebx,8 + or ebp,eax + or edi,ebx + movzx eax,cl + movzx eax,byte ptr[r11 + 1 + 4*rax] + movzx ebx,ch + movzx ebx,byte ptr[r11 + 1 + 4*rbx] + shl eax,16 + shl ebx,24 + or esi,eax + or r8d,ebx + + movzx eax,dl + movzx ebx,dh + movzx eax,byte ptr[r11 + 1 + 4*rax] + shr edx,16 + movzx ebx,byte ptr[r11 + 1 + 4*rbx] + shl ebx,8 + or r8d,eax + or ebp,ebx + movzx eax,dl + movzx eax,byte ptr[r11 + 1 + 4*rax] + movzx ebx,dh + movzx ebx,byte ptr[r11 + 1 + 4*rbx] + shl eax,16 + shl ebx,24 + or edi,eax + or esi,ebx + + // + // xor in final round key + // + + xor r8d,[r10+12] + xor esi,[r10] + xor edi,[r10+4] + xor ebp,[r10+8] +MACRO_END() + +MACRO_START(AES_DECRYPT_MACRO, AesDecryptMacroLoopLabel) + // + // Ciphertext in eax, ebx, ecx, edx + // r9 points to first round key to use + // r10 is last key to use (unchanged) + // r11 points to InvSboxMatrixMult (unchanged) + // r12 points to InvSbox (unchanged) + // Ciphertext ends up in esi, edi, ebp, r8d + // + + // + // xor in first round key + // + xor eax,[r9] + xor ebx,[r9+4] + xor ecx,[r9+8] + xor edx,[r9+12] + + add r9,32 + + // Do not unroll the loop at all because very few CPUs use this codepath so it's worth + // minimizing the binary size +AesDecryptMacroLoopLabel: + // Block is eax, ebx, ecx, edx + // r9-16 points to next round key + + DEC_MIX r9-16 + + cmp r9,r10 + lea r9,[r9+16] + jc AesDecryptMacroLoopLabel + + // + // Now for the final round + // Result is in esi, edi, ebp, r8d + // + + movzx esi,al + movzx esi,byte ptr[r12 + rsi] + movzx edi,ah + shr eax,16 + movzx edi,byte ptr[r12 + rdi] + movzx ebp,al + shl edi,8 + movzx ebp,byte ptr[r12 + rbp] + shl ebp,16 + movzx eax,ah + movzx r8d,byte ptr[r12 + rax] + shl r8d,24 + + movzx eax,bl + movzx eax,byte ptr[r12 + rax] + or edi,eax + movzx eax,bh + shr ebx,16 + movzx eax,byte ptr[r12 + rax] + shl eax,8 + or ebp,eax + movzx eax,bl + movzx eax,byte ptr[r12 + rax] + movzx ebx,bh + shl eax,16 + movzx ebx,byte ptr[r12 + rbx] + or r8d,eax + shl ebx,24 + or esi,ebx + + movzx eax,cl + movzx ebx,ch + movzx eax,byte ptr[r12 + rax] + shr ecx,16 + movzx ebx,byte ptr[r12 + rbx] + shl ebx,8 + or ebp,eax + or r8d,ebx + movzx eax,cl + movzx eax,byte ptr[r12 + rax] + movzx ebx,ch + movzx ebx,byte ptr[r12 + rbx] + shl eax,16 + shl ebx,24 + or esi,eax + or edi,ebx + + movzx eax,dl + movzx ebx,dh + movzx eax,byte ptr[r12 + rax] + shr edx,16 + movzx ebx,byte ptr[r12 + rbx] + shl ebx,8 + or r8d,eax + or esi,ebx + movzx eax,dl + movzx eax,byte ptr[r12 + rax] + movzx ebx,dh + movzx ebx,byte ptr[r12 + rbx] + shl eax,16 + shl ebx,24 + or edi,eax + or ebp,ebx + + // + // xor in final round key + // + + xor esi,[r10] + xor edi,[r10+4] + xor ebp,[r10+8] + xor r8d,[r10+12] +MACRO_END() + +#if USE_BLOCK_FUNCTION + + // + // We use a block function, the AES_ENCRYPT macro merely calls 
the function
+ //
+
+MACRO_START(AES_ENCRYPT, loopLabel)
+ call SymCryptAesEncryptAsmInternal
+MACRO_END()
+
+MACRO_START(AES_DECRYPT, loopLabel)
+ call SymCryptAesDecryptAsmInternal
+MACRO_END()
+
+//========================================
+// SymCryptAesEncryptAsmInternal
+//
+// Internal AES encryption routine with modified calling convention.
+// This function has the exact same calling convention as the AES_ENCRYPT_MACRO
+
+FUNCTION_START(SymCryptAesEncryptAsmInternal, 0, 0)
+
+ AES_ENCRYPT_MACRO SymCryptAesEncryptAsmInternalLoop
+
+FUNCTION_END(SymCryptAesEncryptAsmInternal)
+
+//========================================
+// SymCryptAesDecryptAsmInternal
+//
+// Internal AES decryption routine with modified calling convention.
+// This function has the exact same calling convention as the AES_DECRYPT_MACRO
+//
+
+FUNCTION_START(SymCryptAesDecryptAsmInternal, 0, 0)
+
+ AES_DECRYPT_MACRO SymCryptAesDecryptAsmInternalLoop
+
+FUNCTION_END(SymCryptAesDecryptAsmInternal)
+
+#else
+
+ //
+ // No block function, use the macro directly
+ //
+
+MACRO_START(AES_ENCRYPT, loopLabel)
+ AES_ENCRYPT_MACRO loopLabel
+MACRO_END()
+
+MACRO_START(AES_DECRYPT, loopLabel)
+ AES_DECRYPT_MACRO loopLabel
+MACRO_END()
+
+#endif
+
+//
+//VOID
+//SYMCRYPT_CALL
+//SymCryptAesEncrypt( _In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
+// _In_reads_bytes_( SYMCRYPT_AES_BLOCK_LEN ) PCBYTE pbPlaintext,
+// _Out_writes_bytes_( SYMCRYPT_AES_BLOCK_LEN ) PBYTE pbCiphertext )
+//
+
+NESTED_FUNCTION_START(SymCryptAesEncryptAsm, 3, 15)
+
+ SYMCRYPT_CHECK_MAGIC SymCryptAesEncryptAsmCheckMagic, Q1, magicFieldOffset, Q1
+
+ // Here we convert from whatever calling convention we are called from externally to our
+ // AES internal calling convention.
+ // We need to be careful that we don't overwrite an argument register before we copy it to
+ // the place it is needed internally in the AES functions.
+ // There is no automatic method for checking we do this correctly - modify with care!
+ // In SystemV and MSFT x64 ABIs, the possible 3 argument registers are:
+ // rcx, rdx, r8, rdi, rsi
+
+ mov r10, [Q1 + lastEncRoundKeyOffset]
+ mov r9, Q1
+
+ mov [rsp + GET_MEMSLOT_OFFSET(slot0)], Q3
+
+ //
+ // Load the plaintext
+ //
+ mov eax,[Q2 ]
+ mov ebx,[Q2 + 4]
+ mov ecx,[Q2 + 8]
+ mov edx,[Q2 + 12]
+
+ lea r11,[GET_SYMBOL_ADDRESS(SymCryptAesSboxMatrixMult)]
+
+ AES_ENCRYPT SymCryptAesEncryptAsmLoop
+ // Plaintext in eax, ebx, ecx, edx
+ // r9 points to first round key to use
+ // r10 is last key to use (unchanged)
+ // r11 points to SboxMatrixMult (unchanged)
+ // Ciphertext ends up in esi, edi, ebp, r8d
+
+ // retrieve pbCiphertext using Q0 because it is always rax regardless of calling convention
+ mov Q0,[rsp + GET_MEMSLOT_OFFSET(slot0)]
+ mov [Q0 ], esi
+ mov [Q0 + 4], edi
+ mov [Q0 + 8], ebp
+ mov [Q0 + 12], r8d
+
+NESTED_FUNCTION_END(SymCryptAesEncryptAsm)
+
+
+//
+//VOID
+//SYMCRYPT_CALL
+//SymCryptAesDecrypt( _In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
+// _In_reads_bytes_( SYMCRYPT_AES_BLOCK_LEN ) PCBYTE pbCiphertext,
+// _Out_writes_bytes_( SYMCRYPT_AES_BLOCK_LEN ) PBYTE pbPlaintext )
+
+NESTED_FUNCTION_START(SymCryptAesDecryptAsm, 3, 15)
+
+ SYMCRYPT_CHECK_MAGIC SymCryptAesDecryptAsmCheckMagic, Q1, magicFieldOffset, Q1
+
+ // Here we convert from whatever calling convention we are called from externally to our
+ // AES internal calling convention.
+ // We need to be careful that we don't overwrite an argument register before we copy or use
+ // the value appropriately for use in the AES functions.
+ // There is no automatic method for checking we do this correctly - modify with care!
+ // In SystemV and MSFT x64 ABIs, the possible 3 argument registers are:
+ // rcx, rdx, r8, rdi, rsi
+
+ mov r9,[Q1 + lastEncRoundKeyOffset]
+ mov r10,[Q1 + lastDecRoundKeyOffset]
+
+ mov [rsp + GET_MEMSLOT_OFFSET(slot0)], Q3
+
+ mov eax,[Q2 ]
+ mov ebx,[Q2 + 4]
+ mov ecx,[Q2 + 8]
+ mov edx,[Q2 + 12]
+
+ lea r11,[GET_SYMBOL_ADDRESS(SymCryptAesInvSboxMatrixMult)]
+ lea r12,[GET_SYMBOL_ADDRESS(SymCryptAesInvSbox)]
+
+ AES_DECRYPT SymCryptAesDecryptAsmLoop
+ // Ciphertext in eax, ebx, ecx, edx
+ // r9 points to first round key to use
+ // r10 is last key to use (unchanged)
+ // r11 points to InvSboxMatrixMult (unchanged)
+ // r12 points to InvSbox (unchanged)
+ // Plaintext ends up in esi, edi, ebp, r8d
+
+ // retrieve pbPlaintext using Q0 because it is always rax regardless of calling convention
+ mov Q0,[rsp + GET_MEMSLOT_OFFSET(slot0)]
+ mov [Q0 ], esi
+ mov [Q0 + 4], edi
+ mov [Q0 + 8], ebp
+ mov [Q0 + 12], r8d
+
+NESTED_FUNCTION_END(SymCryptAesDecryptAsm)
+
+//VOID
+//SYMCRYPT_CALL
+//SymCryptAesCbcEncrypt(
+// _In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
+// _In_reads_bytes_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbChainingValue,
+// _In_reads_bytes_( cbData ) PCBYTE pbSrc,
+// _Out_writes_bytes_( cbData ) PBYTE pbDst,
+// SIZE_T cbData )
+
+NESTED_FUNCTION_START(SymCryptAesCbcEncryptAsm, 5, 15)
+
+ // Here we convert from whatever calling convention we are called from externally to our
+ // AES internal calling convention.
+ // We need to be careful that we don't overwrite an argument register before we copy or use
+ // the value appropriately for use in the AES functions.
+ // There is no automatic method for checking we do this correctly - modify with care!
+ // In SystemV and MSFT x64 ABIs, the possible 5 argument registers are:
+ // rcx, rdx, r8, r9, r10, rdi, rsi
+
+ SYMCRYPT_CHECK_MAGIC SymCryptAesCbcEncryptAsmCheckMagic, Q1, magicFieldOffset, Q1
+
+ and Q5, NOT 15 // only deal with whole # blocks
+ jz SymCryptAesCbcEncryptNoData
+
+ mov [rsp + GET_MEMSLOT_OFFSET(slot0)], Q2 // save pbChainingValue
+ mov rax, Q2 // rax = pbChainingValue
+ mov r13, Q3 // r13 = pbSrc
+
+ mov r15, Q5 // r15 = cbData
+ mov r14, Q4 // r14 = pbDst
+
+ add r15, Q3 // r15 = pbSrcEnd
+
+ mov r10,[Q1 + lastEncRoundKeyOffset] // r10 = last enc round key
+ mov r12,Q1 // r12 = first round key to use
+
+ //
+ // Load the chaining state from pbChainingValue
+ //
+ mov esi,[rax ]
+ mov edi,[rax + 4]
+ mov ebp,[rax + 8]
+ mov r8d,[rax + 12]
+
+ lea r11,[GET_SYMBOL_ADDRESS(SymCryptAesSboxMatrixMult)]
+
+ALIGN(16)
+SymCryptAesCbcEncryptAsmLoop:
+ // Loop register setup
+ // r10 = last round key to use
+ // r12 = first round key to use
+ // r13 = pbSrc
+ // r14 = pbDst
+ // r15 = pbSrcEnd
+
+ // chaining state in (esi,edi,ebp,r8d)
+
+ mov eax, [r13]
+ mov r9, r12
+ mov ebx, [r13+4]
+ xor eax, esi
+ mov ecx, [r13+8]
+ xor ebx, edi
+ xor ecx, ebp
+ mov edx, [r13+12]
+ xor edx, r8d
+
+ add r13, 16
+
+
+ AES_ENCRYPT SymCryptAesCbcEncryptAsmInnerLoop
+ //
+ // Plaintext in eax, ebx, ecx, edx
+ // r9 points to first round key to use
+ // r10 is last key to use (unchanged)
+ // r11 points to SboxMatrixMult (unchanged)
+ // Ciphertext ends up in esi, edi, ebp, r8d
+ //
+
+ mov [r14], esi
+ mov [r14+4], edi
+ mov [r14+8], ebp
+ mov [r14+12], r8d
+
+ add r14, 16
+
+ cmp r13, r15
+
+ jb SymCryptAesCbcEncryptAsmLoop
+
+
+ //
+ // Update the chaining value
+ //
+ mov Q0,[rsp + GET_MEMSLOT_OFFSET(slot0)]
+ mov [Q0], esi
+ mov [Q0+4], edi
+ mov [Q0+8], ebp
+ mov [Q0+12], r8d
+
+SymCryptAesCbcEncryptNoData:
+
+NESTED_FUNCTION_END(SymCryptAesCbcEncryptAsm)
+
+
+//VOID
+//SYMCRYPT_CALL
+//SymCryptAesCbcDecrypt(
+// _In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
+// _In_reads_bytes_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbChainingValue,
+// _In_reads_bytes_( cbData ) PCBYTE pbSrc,
+// _Out_writes_bytes_( cbData ) PBYTE pbDst,
+// SIZE_T cbData )
+
+NESTED_FUNCTION_START(SymCryptAesCbcDecryptAsm, 5, 15)
+
+ // Here we convert from whatever calling convention we are called from externally to our
+ // AES internal calling convention.
+ // We need to be careful that we don't overwrite an argument register before we copy or use
+ // the value appropriately for use in the AES functions.
+ // There is no automatic method for checking we do this correctly - modify with care!
+ // In SystemV and MSFT x64 ABIs, the possible 5 argument registers are:
+ // rcx, rdx, r8, r9, r10, rdi, rsi
+
+ SYMCRYPT_CHECK_MAGIC SymCryptAesCbcDecryptAsmCheckMagic, Q1, magicFieldOffset, Q1
+
+ and Q5, NOT 15
+ jz SymCryptAesCbcDecryptNoData
+
+ mov [rsp + GET_MEMSLOT_OFFSET(slot0)], Q2 // save pbChainingValue
+ mov [rsp + GET_MEMSLOT_OFFSET(slot1)], Q3 // save pbSrc
+
+ lea r14, [Q5 - 16]
+ lea r15, [Q4 + r14] // r15 = pbDst pointed to last block
+ add r14, Q3 // r14 = pbSrc pointed to last block
+
+ mov r13,[Q1 + lastEncRoundKeyOffset]
+ mov r10,[Q1 + lastDecRoundKeyOffset]
+
+ lea r11,[GET_SYMBOL_ADDRESS(SymCryptAesInvSboxMatrixMult)]
+ lea r12,[GET_SYMBOL_ADDRESS(SymCryptAesInvSbox)]
+
+ //
+ // Load last ciphertext block & save on stack (we need to put it in the pbChaining buffer later)
+ //
+ mov eax,[r14]
+ mov ebx,[r14+4]
+ mov ecx,[r14+8]
+ mov edx,[r14+12]
+
+ mov [rsp + GET_MEMSLOT_OFFSET(slot2) ], eax
+ mov [rsp + GET_MEMSLOT_OFFSET(slot2)+4], ebx
+ mov [rsp + GET_MEMSLOT_OFFSET(slot3) ], ecx
+ mov [rsp + GET_MEMSLOT_OFFSET(slot3)+4], edx
+
+ jmp SymCryptAesCbcDecryptAsmLoopEntry
+
+ALIGN(16)
+
+SymCryptAesCbcDecryptAsmLoop:
+ // Loop register setup
+ // r13 = first round key to use
+ // r14 = pbSrc
+ // r15 = pbDst
+ // [slot1] = pbSrcStart
+
+ // current ciphertext block (esi,edi,ebp,r8d)
+
+ mov eax,[r14-16]
+ mov ebx,[r14-12]
+ xor esi,eax
+ mov ecx,[r14-8]
+ xor edi,ebx
+ mov [r15],esi
+ mov edx,[r14-4]
+ xor ebp,ecx
+ mov [r15+4],edi
+ xor r8d,edx
+ mov [r15+8],ebp
+ mov [r15+12],r8d
+
+ sub r14,16
+ sub r15,16
+
+SymCryptAesCbcDecryptAsmLoopEntry:
+
+ mov r9, r13
+
+ AES_DECRYPT SymCryptAesCbcDecryptAsmInnerLoop
+ //
+ // Ciphertext in eax, ebx, ecx, edx
+ // r9 points to first round key to use
+ // r10 is last key to use (unchanged)
+ // r11 points to InvSboxMatrixMult (unchanged)
+ // r12 points to InvSbox (unchanged)
+ // Plaintext ends up in esi, edi, ebp, r8d
+ //
+
+ cmp r14, [rsp + GET_MEMSLOT_OFFSET(slot1)] // pbSrc
+ ja SymCryptAesCbcDecryptAsmLoop
+
+ mov rbx,[rsp + GET_MEMSLOT_OFFSET(slot0)] // pbChainingValue
+ xor esi,[rbx]
+ xor edi,[rbx+4]
+ xor ebp,[rbx+8]
+ xor r8d,[rbx+12]
+
+ mov [r15], esi
+ mov [r15+4], edi
+ mov [r15+8], ebp
+ mov [r15+12], r8d
+
+ //
+ // Update the chaining value to the last ciphertext block
+ //
+ mov rax,[rsp + GET_MEMSLOT_OFFSET(slot2)]
+ mov rcx,[rsp + GET_MEMSLOT_OFFSET(slot3)]
+ mov [rbx], rax
+ mov [rbx+8], rcx
+
+SymCryptAesCbcDecryptNoData:
+
+NESTED_FUNCTION_END(SymCryptAesCbcDecryptAsm)
+
+//VOID
+//SYMCRYPT_CALL
+//SymCryptAesCtrMsb64(
+// _In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
+// _In_reads_bytes_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbChainingValue,
+// _In_reads_bytes_( cbData ) PCBYTE
pbSrc, +// _Out_writes_bytes_( cbData ) PBYTE pbDst, +// SIZE_T cbData ) + +NESTED_FUNCTION_START(SymCryptAesCtrMsb64Asm, 5, 15) + + // Here we convert from whatever calling convention we are called from externally to our + // AES internal calling convention. + // We need to be careful that we don't overwrite an argument register before we copy or use + // the value appropriately for use in the AES functions. + // There is no automatic method for checking we do this correctly - modify with care! + // In SystemV and MSFT x64 ABIs, the possible 5 argument registers are: + // rcx, rdx, r8, r9, r10, rdi, rsi + + SYMCRYPT_CHECK_MAGIC SymCryptAesCtrMsb64AsmCheckMagic, Q1, magicFieldOffset, Q1 + + and Q5, NOT 15 // only deal with whole # blocks + jz SymCryptAesCtrMsb64NoData + + mov [rsp + GET_MEMSLOT_OFFSET(slot0)], Q2 // save pbChainingState + mov rax, Q2 // rax = pbChainingValue + mov r13, Q3 // r13 = pbSrc + mov r14, Q5 // r14 = cbData + mov r15, Q4 // r15 = pbDst + add r14, Q3 // r14 = cbData + pbSrc = pbSrcEnd + + mov r10,[Q1 + lastEncRoundKeyOffset] // r10 = last enc round key + mov r12,Q1 // r12 = first round key to use + + + lea r11,[GET_SYMBOL_ADDRESS(SymCryptAesSboxMatrixMult)] + + // + // Load the chaining state + // + mov rcx, [rax + 8] + mov rax, [rax ] + + // + // Store it in our local copy (we have no register free to keep pbChainingState in) + // + mov [rsp + GET_MEMSLOT_OFFSET(slot1)], rax + mov [rsp + GET_MEMSLOT_OFFSET(slot2)], rcx + + // + // Move to the right registers + // + mov rbx, rax + mov rdx, rcx + shr rbx, 32 + shr rdx, 32 + +ALIGN(16) +SymCryptAesCtrMsb64AsmLoop: + // Loop invariant + // Current chaining state is in (eax, ebx, ecx, edx) + // r10 = last round key to use + // r11 = SboxMatrixMult + // r12 = first round key to use + // r13 = pbSrc + // r14 = pbSrcEnd + // r15 = pbDst + // [slot1..slot2] = 16 bytes chaining state block + + mov r9, r12 + + AES_ENCRYPT SymCryptAesCtrMsb64AsmInnerLoop + // + // Plaintext in eax, ebx, ecx, edx + // r9 points to first round key to use + // r10 is last key to use (unchanged) + // r11 points to SboxMatrixMult (unchanged) + // Ciphertext ends up in esi, edi, ebp, r8d + // + + // To improve latency, we FIRST + // load the chaining state, increment the counter, and write it back. + // leave the state in the (eax, ebx, ecx, edx) registers + + mov eax,dword ptr [rsp + GET_MEMSLOT_OFFSET(slot1) + 0] + mov ebx,dword ptr [rsp + GET_MEMSLOT_OFFSET(slot1) + 4] + mov rcx,[rsp + GET_MEMSLOT_OFFSET(slot2) ] + bswap rcx + add rcx, 1 + bswap rcx + mov [rsp + GET_MEMSLOT_OFFSET(slot2) ], rcx + mov rdx, rcx + shr rdx, 32 + + // THEN we process the XOR of the key stream with the data + // This order is faster as we need to have the chaining state done + // before we can proceed, but there are no dependencies on the data result + // So we can loop back to the beginning while the data stream read/writes are + // still in flight. 
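[Editor's note] The counter arithmetic just above is easy to miss among the loads and stores: bytes 8..15 of the chaining state are a big-endian (MSB-first) 64-bit counter, which the asm byte-swaps, increments, and swaps back. A C sketch of the 'bswap rcx / add rcx, 1 / bswap rcx' sequence, assuming the GCC/Clang __builtin_bswap64 builtin (MSVC has _byteswap_uint64 in <stdlib.h>):

    #include <stdint.h>

    // Increment a 64-bit counter stored in big-endian memory order.
    static inline uint64_t CtrMsb64Increment( uint64_t counterBigEndian )
    {
        uint64_t c = __builtin_bswap64( counterBigEndian );   // to host order
        c += 1;
        return __builtin_bswap64( c );                        // back to big-endian
    }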
+ // + // xor with the source stream + + xor esi,[r13 + 0 ] + xor edi,[r13 + 4 ] + xor ebp,[r13 + 8 ] + xor r8d,[r13 + 12] + + // store at the destination + + mov [r15 + 0], esi + mov [r15 + 4], edi + mov [r15 + 8], ebp + mov [r15 + 12], r8d + + add r13, 16 // pbSrc += 16 + add r15, 16 // pbDst += 16 + + cmp r13, r14 + + jb SymCryptAesCtrMsb64AsmLoop + + // + // Copy back the chaining value - we only modified the last 8 bytes, so that is all we copy + // + mov rsi,[rsp + GET_MEMSLOT_OFFSET(slot0)] // pbChainingState + mov [rsi + 8], ecx + mov [rsi + 12], edx + + // + // Wipe the chaining value on stack + // + xor rax, rax + mov [rsp + GET_MEMSLOT_OFFSET(slot1)], rax + mov [rsp + GET_MEMSLOT_OFFSET(slot2)], rax + +SymCryptAesCtrMsb64NoData: + +NESTED_FUNCTION_END(SymCryptAesCtrMsb64Asm) + +FILE_END() diff --git a/lib/amd64/fdef369_asm.asm b/lib/amd64/fdef369_asm.asm deleted file mode 100644 index f13d995..0000000 --- a/lib/amd64/fdef369_asm.asm +++ /dev/null @@ -1,529 +0,0 @@ -; -; fdef_369asm.asm Assembler code for large integer arithmetic in the default data format -; -; This file contains alternative routines that are used for modular computations -; where the modulus is 257-384 or 513-576 bits long. -; (Currently on ARM64 it is also used for 0-192-bit moduli but not on AMD64) -; -; The immediate advantage is that it improves EC performance on 384, and 521-bit curves. -; -; Most of this code is a direct copy of the default code. -; AMD64 digits are now 512 bits. -; We read the 'ndigit' value. If it is 1 digit, the values are 6 64-bit words, if it is 2 the values -; are 9 64-bit words. As we compute in groups of 3, our loop counters are one more than nDigit -; -; Copyright (c) Microsoft Corporation. Licensed under the MIT license. -; - -include ksamd64.inc - -include symcrypt_version.inc -include symcrypt_magic.inc - - - -include C_asm_shared.inc - -; A digit consists of 4 words of 64 bits each - -;UINT32 -;SYMCRYPT_CALL -;SymCryptFdef369RawAddAsm( -; _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 Src1, -; _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 Src2, -; _Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 Dst, -; UINT32 nDigits ); - - LEAF_ENTRY SymCryptFdef369RawAddAsm, _TEXT - - ; rcx = Src1 - ; rdx = Src2 - ; r8 = Dst - ; r9 = nDigits - - add r9, 1 - xor rax, rax - xor r10, r10 - - ; Cy = 0 - -SymCryptFdef369RawAddAsmLoop: - ; carry is in the carry flag - mov rax,[rcx] - adc rax,[rdx] - mov [r8],rax - - mov rax,[rcx + 8] - adc rax,[rdx + 8] - mov [r8 + 8], rax - - mov rax,[rcx + 16] - adc rax,[rdx + 16] - mov [r8 + 16], rax - - lea rcx, [rcx + 24] - lea rdx, [rdx + 24] - lea r8, [r8 + 24] - dec r9d - jnz SymCryptFdef369RawAddAsmLoop - - mov rax, r10 - adc rax, r10 - - ret - - LEAF_END SymCryptFdef369RawAddAsm, _TEXT - - -;UINT32 -;SYMCRYPT_CALL -;SymCryptFdefRawSubAsm( -; _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc1, -; _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc2, -; _Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 pDst, -; UINT32 nDigits ); - - LEAF_ENTRY SymCryptFdef369RawSubAsm, _TEXT - - ; rcx = Src1 - ; rdx = Src2 - ; r8 = Dst - ; r9 = nDigits - - add r9, 1 - xor rax, rax - xor r10, r10 - -SymCryptFdef369RawSubAsmLoop: - ; carry is in the carry flag - mov rax,[rcx] - sbb rax,[rdx] - mov [r8],rax - - mov rax,[rcx + 8] - sbb rax,[rdx + 8] - mov [r8 + 8], rax - - mov rax,[rcx + 16] - sbb rax,[rdx + 16] - mov [r8 + 16], rax - - lea rcx, [rcx + 24] - lea rdx, [rdx + 24] - 
lea r8, [r8 + 24] - dec r9d - jnz SymCryptFdef369RawSubAsmLoop - - mov rax, r10 - adc rax, r10 - - ret - - LEAF_END SymCryptFdef369RawSubAsm, _TEXT - - - -;VOID -;SYMCRYPT_CALL -;SymCryptFdefMaskedCopy( -; _In_reads_bytes_( nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCBYTE pbSrc, -; _InOut_writes_bytes_( nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PBYTE pbDst, -; UINT32 nDigits, -; UINT32 mask ) - - LEAF_ENTRY SymCryptFdef369MaskedCopyAsm, _TEXT - - add r8d, 1 - movsxd r9, r9d - -SymCryptFdef369MaskedCopyAsmLoop: - mov rax, [rcx] - mov r10, [rdx] - xor rax, r10 - and rax, r9 - xor rax, r10 - mov [rdx], rax - - mov rax, [rcx + 8] - mov r10, [rdx + 8] - xor rax, r10 - and rax, r9 - xor rax, r10 - mov [rdx + 8], rax - - mov rax, [rcx + 16] - mov r10, [rdx + 16] - xor rax, r10 - and rax, r9 - xor rax, r10 - mov [rdx + 16], rax - - ; Move on to the next digit - - add rcx, 24 - add rdx, 24 - sub r8d, 1 - jnz SymCryptFdef369MaskedCopyAsmLoop - ret - - LEAF_END SymCryptFdef369MaskedCopyAsm, _TEXT - -;VOID -;SYMCRYPT_CALL -;SymCryptFdefRawMul( -; _In_reads_(nWords1) PCUINT32 pSrc1, -; UINT32 nDigits1, -; _In_reads_(nWords2) PCUINT32 pSrc2, -; UINT32 nDigits2, -; _Out_writes_(nWords1 + nWords2) PUINT32 pDst ) - -SymCryptFdef369RawMulAsm_Frame struct - SavedRbx dq ? - SavedRdi dq ? - SavedRsi dq ? - SavedR13 dq ? - SavedR12 dq ? - returnaddress dq ? - Arg1Home dq ? - Arg2Home dq ? - Arg3Home dq ? - Arg4Home dq ? - pDst dq ? - -SymCryptFdef369RawMulAsm_Frame ends - - NESTED_ENTRY SymCryptFdef369RawMulAsm, _TEXT - - rex_push_reg rbx - push_reg r12 - push_reg r13 - push_reg rsi - push_reg rdi - - END_PROLOGUE - - ; Basic structure: - ; for each word in Src1: - ; Dst += Src2 * word - ; Register assignments - ; - ; rax = tmp for mul - ; rbx = word from Src1 to multiply with - ; rcx = pSrc1 (updated in outer loop) - ; rdx = tmp for mul - ; rsi = inner loop pointer into pSrc2 - ; rdi = inner loop pointer into pDst - ; r8 = pSrc2 - ; r9 = nDigits2 - ; r10 = pDst (incremented in outer loop) - ; r11 = # words left from Src1 to process - ; r12 = carry - ; r13 = inner loop counter - - - add edx, 1 - add r9d, 1 - lea r11d, [edx + 2*edx] ; nDigits1 * 3 = # words in Src1 to process - mov r10, [rsp + SymCryptFdef369RawMulAsm_Frame.pDst ] - - ; Outer loop invariant established: rcx, r8, r9, r10 - - - mov rsi, r8 ; rsi = pSrc2 - mov rdi, r10 ; rdi = pDst + outer loop ctr - mov rbx, [rcx] ; mulword - xor r12, r12 - mov r13d, r9d - - ; First inner loop overwrites Dst, which avoids adding the current Dst value - -SymCryptFdef369RawMulAsmLoop1: - mov rax, [rsi] - mul rbx - add rax, r12 - adc rdx, 0 - mov [rdi], rax - mov r12, rdx - - mov rax, [rsi + 8] - mul rbx - add rax, r12 - adc rdx, 0 - mov [rdi + 8], rax - mov r12, rdx - - mov rax, [rsi + 16] - mul rbx - add rax, r12 - adc rdx, 0 - mov [rdi + 16], rax - mov r12, rdx - - add rsi, 24 - add rdi, 24 - sub r13d,1 - jnz SymCryptFdef369RawMulAsmLoop1 - - mov [rdi], rdx ; write last word, cannot overflow because Dst is at least 2 digits long - - sub r11d, 1 - -SymCryptFdef369RawMulAsmLoopOuter: - - add rcx, 8 ; move to next word of pSrc1 - add r10, 8 ; move Dst pointer one word over - mov rbx, [rcx] - mov rsi, r8 - mov rdi, r10 - xor r12, r12 - mov r13d, r9d - -SymCryptFdef369RawMulAsmLoop2: - mov rax, [rsi] - mul rbx - add rax, [rdi] - adc rdx, 0 - add rax, r12 - adc rdx, 0 - mov [rdi], rax - mov r12, rdx - - mov rax, [rsi + 8] - mul rbx - add rax, [rdi + 8] - adc rdx, 0 - add rax, r12 - adc rdx, 0 - mov [rdi + 8], rax - mov r12, rdx - - mov rax, [rsi + 16] - mul rbx - add rax, [rdi + 
16] - adc rdx, 0 - add rax, r12 - adc rdx, 0 - mov [rdi + 16], rax - mov r12, rdx - - add rsi, 24 - add rdi, 24 - sub r13d,1 - jnz SymCryptFdef369RawMulAsmLoop2 - - mov [rdi], rdx ; write next word. (stays within Dst buffer) - - sub r11d, 1 - jnz SymCryptFdef369RawMulAsmLoopOuter - - BEGIN_EPILOGUE - - pop rdi - pop rsi - pop r13 - pop r12 - pop rbx - ret - - NESTED_END SymCryptFdef369RawMulAsm, _TEXT - - - - - - -;VOID -;SymCryptFdefMontgomeryReduceAsm( -; _In_ PCSYMCRYPT_MODULUS pmMod, -; _In_ PUINT32 pSrc, -; _Out_ PUINT32 pDst ) - - NESTED_ENTRY SymCryptFdef369MontgomeryReduceAsm, _TEXT - - rex_push_reg rbx - push_reg r12 - push_reg r13 - push_reg r14 - push_reg rsi - push_reg rdi - push_reg rbp - - END_PROLOGUE - - mov r11, rdx ; r11 = pSrc - mov ebp, [rcx + SymCryptModulusNdigitsOffsetAmd64] ; nDigits - add ebp, 1 - mov r13, [rcx + SymCryptModulusMontgomeryInv64OffsetAmd64] ; inv64 - - lea rcx, [rcx + SymCryptModulusValueOffsetAmd64] ; modulus value - - lea edi, [ebp + 2*ebp] ; outer loop counter, in words - - xor r14d, r14d - - ; General register allocations - ; rax = multiply result - ; rbx = multiplier in inner loop - ; rcx = pointer to modulus value - ; rdx = multiply result - ; rsi = loop counter - ; rdi = loop counter - ; rbp = nDigits - ; r8 = pDst - ; r9 = running pointer in Src - ; r10 = running pointer in Mod - ; r11 = pSrc (updated in outer loop) - ; r12 = carry - ; r13 = pmMod->tm.montgomery.inv64 - ; r14 = carry out from last word of previous loop iteration - - -SymCryptFdef369MontgomeryReduceAsmOuterLoop: - - ; start decoder with a few simple instructions, including at least one that requires - ; a uop execution and is on the critical path - - mov rbx, [r11] ; fetch word of Src we want to set to zero - mov r10, r11 - mov r9, rcx - - imul rbx, r13 ; lower word is same for signed & unsigned multiply - - mov esi, ebp - xor r12d, r12d - -SymCryptFdef369MontgomeryReduceAsmInnerloop: - ; rax = mul scratch - ; rbx = multiplier - ; rcx = pointer to modulus value - ; rdx = mul scratch - ; edi = outer loop counter (words) - ; esi = inner loop counter (digits) - ; r9 = running ptr to modulus - ; r10 = running ptr to input/scratch - ; r12 = carry (64 bits) - - mov rax, [r9] - mul rbx - add rax, [r10] - adc rdx, 0 - add rax, r12 - adc rdx, 0 - mov [r10], rax - mov r12, rdx - - mov rax, [r9 + 8] - mul rbx - add rax, [r10 + 8] - adc rdx, 0 - add rax, r12 - adc rdx, 0 - mov [r10 + 8], rax - mov r12, rdx - - mov rax, [r9 + 16] - mul rbx - add rax, [r10 + 16] - adc rdx, 0 - add rax, r12 - adc rdx, 0 - mov [r10 + 16], rax - mov r12, rdx - - add r9, 24 - add r10, 24 - sub esi,1 - jnz SymCryptFdef369MontgomeryReduceAsmInnerloop - - add r12, r14 - mov r14d, 0 - adc r14, 0 - add r12, [r10] - adc r14, 0 - mov [r10], r12 - - add r11, 8 - - sub edi, 1 - jnz SymCryptFdef369MontgomeryReduceAsmOuterLoop - - ; - ; Most of the work is done; now all that is left is subtract the modulus if it is smaller than the result - ; - - ; First we compute the pSrc result minus the modulus into the destination - mov esi, ebp ; loop ctr - mov r10, r11 ; pSrc - mov r9, rcx ; pMod - mov r12, r8 ; pDst - - ; Cy = 0 because the last 'sub edi,1' resulted in 0 - -SymCryptFdef369MontgomeryReduceAsmSubLoop: - mov rax,[r10] - sbb rax,[r9] - mov [r12], rax - - mov rax,[r10 + 8] - sbb rax,[r9 + 8] - mov [r12 + 8], rax - - mov rax,[r10 + 16] - sbb rax,[r9 + 16] - mov [r12 + 16], rax - - lea r10,[r10+24] - lea r9, [r9 +24] - lea r12,[r12+24] - dec esi - jnz SymCryptFdef369MontgomeryReduceAsmSubLoop - - ; Finally a masked copy 
form pSrc to pDst
- ; copy if: r14 == 0 && Cy = 1
- sbb r14, 0 ; mask (64 bits)
-
-
-SymCryptFdef369MontgomeryReduceAsmMaskedCopyLoop:
- mov rax, [r11]
- mov rsi, [r8]
- xor rax, rsi
- and rax, r14
- xor rax, rsi
- mov [r8], rax
-
- mov rax, [r11 + 8]
- mov rsi, [r8 + 8]
- xor rax, rsi
- and rax, r14
- xor rax, rsi
- mov [r8 + 8], rax
-
- mov rax, [r11 + 16]
- mov rsi, [r8 + 16]
- xor rax, rsi
- and rax, r14
- xor rax, rsi
- mov [r8 + 16], rax
-
- ; Move on to the next digit
-
- add r11, 24
- add r8, 24
- sub ebp, 1
- jnz SymCryptFdef369MontgomeryReduceAsmMaskedCopyLoop
-
- BEGIN_EPILOGUE
-
- pop rbp
- pop rdi
- pop rsi
- pop r14
- pop r13
- pop r12
- pop rbx
- ret
-
- NESTED_END SymCryptFdef369MontgomeryReduceAsm, _TEXT
-
- end
-
diff --git a/lib/amd64/fdef369_asm.symcryptasm b/lib/amd64/fdef369_asm.symcryptasm
new file mode 100644
index 0000000..61ae581
--- /dev/null
+++ b/lib/amd64/fdef369_asm.symcryptasm
@@ -0,0 +1,451 @@
+//
+// fdef369_asm.symcryptasm Assembler code for large integer arithmetic in the default data format
+// Expresses asm in a generic enough way to enable generation of MASM and GAS using the
+// symcryptasm_processor.py script and C preprocessor
+//
+// This file contains alternative routines that are used for modular computations
+// where the modulus is 257-384 or 513-576 bits long.
+// (Currently on ARM64 it is also used for 0-192-bit moduli but not on AMD64)
+//
+// The immediate advantage is that it improves EC performance on 384- and 521-bit curves.
+//
+// Most of this code is a direct copy of the default code.
+// AMD64 digits are now 512 bits.
+// We read the 'nDigits' value. If it is 1 digit, the values are 6 64-bit words; if it is 2, the values
+// are 9 64-bit words. As we compute in groups of 3, our loop counters are one more than nDigits.
+//
+// Copyright (c) Microsoft Corporation. Licensed under the MIT license.
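[Editor's note] Both the deleted file above and its symcryptasm replacement below lean on the same branch-free masked-copy idiom (three xors and an and per word) to keep the final "subtract the modulus or not" decision constant-time. A C model of the idiom, as a sketch:

    #include <stdint.h>

    // mask must be all-ones (copy src) or all-zeroes (keep dst); there is no
    // data-dependent branch, so timing does not leak which case was taken.
    static void MaskedCopyWord( const uint64_t *src, uint64_t *dst, uint64_t mask )
    {
        uint64_t s = *src;
        uint64_t d = *dst;
        *dst = d ^ ( ( s ^ d ) & mask );   // mask == ~0 gives s; mask == 0 gives d
    }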
+ +#include "symcryptasm_shared.cppasm" + +// A digit consists of 4 words of 64 bits each + +//UINT32 +//SYMCRYPT_CALL +// SymCryptFdef369RawAddAsm( +// _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 Src1, +// _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 Src2, +// _Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 Dst, +// UINT32 nDigits ) +FUNCTION_START(SymCryptFdef369RawAddAsm, 4, 5) + + inc D4 + xor Q0, Q0 + +SymCryptFdef369RawAddAsmLoop: + // carry is in the carry flag + mov Q0,[Q1] + adc Q0,[Q2] + mov [Q3],Q0 + + mov Q0,[Q1 + 8] + adc Q0,[Q2 + 8] + mov [Q3 + 8], Q0 + + mov Q0,[Q1 + 16] + adc Q0,[Q2 + 16] + mov [Q3 + 16], Q0 + + lea Q1, [Q1 + 24] + lea Q2, [Q2 + 24] + lea Q3, [Q3 + 24] + dec D4 + jnz SymCryptFdef369RawAddAsmLoop + + mov Q0, 0 + adc Q0, Q0 + +FUNCTION_END(SymCryptFdef369RawAddAsm) + +// UINT32 +// SYMCRYPT_CALL +// SymCryptFdef369RawSubAsm( +// _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc1, +// _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc2, +// _Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 pDst, +// UINT32 nDigits ) + +FUNCTION_START(SymCryptFdef369RawSubAsm, 4, 5) + + inc D4 + xor Q0, Q0 + +SymCryptFdef369RawSubAsmLoop: + // carry is in the carry flag + mov Q0,[Q1] + sbb Q0,[Q2] + mov [Q3],Q0 + + mov Q0,[Q1 + 8] + sbb Q0,[Q2 + 8] + mov [Q3 + 8], Q0 + + mov Q0,[Q1 + 16] + sbb Q0,[Q2 + 16] + mov [Q3 + 16], Q0 + + lea Q1, [Q1 + 24] + lea Q2, [Q2 + 24] + lea Q3, [Q3 + 24] + dec D4 + jnz SymCryptFdef369RawSubAsmLoop + + mov Q0, 0 + adc Q0, Q0 + +FUNCTION_END(SymCryptFdef369RawSubAsm) + +// VOID +// SYMCRYPT_CALL +// SymCryptFdef369MaskedCopyAsm( +// _In_reads_bytes_( nDigits*SYMCRYPT_FDEF_DIGIT_SIZE ) PCBYTE pbSrc, +// _Inout_updates_bytes_( nDigits*SYMCRYPT_FDEF_DIGIT_SIZE ) PBYTE pbDst, +// UINT32 nDigits, +// UINT32 mask ) + +FUNCTION_START(SymCryptFdef369MaskedCopyAsm, 4, 6) + + inc D3 + movsxd Q4, D4 + +SymCryptFdef369MaskedCopyAsmLoop: + mov Q0, [Q1] + mov Q5, [Q2] + xor Q0, Q5 + and Q0, Q4 + xor Q0, Q5 + mov [Q2], Q0 + + mov Q0, [Q1 + 8] + mov Q5, [Q2 + 8] + xor Q0, Q5 + and Q0, Q4 + xor Q0, Q5 + mov [Q2 + 8], Q0 + + mov Q0, [Q1 + 16] + mov Q5, [Q2 + 16] + xor Q0, Q5 + and Q0, Q4 + xor Q0, Q5 + mov [Q2 + 16], Q0 + + // Move on to the next digit + + add Q1, 24 + add Q2, 24 + dec D3 + jnz SymCryptFdef369MaskedCopyAsmLoop + +FUNCTION_END(SymCryptFdef369MaskedCopyAsm) + +// VOID +// SYMCRYPT_CALL +// SymCryptFdef369RawMulAsm( +// _In_reads_(nDigits1*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc1, +// UINT32 nDigits1, +// _In_reads_(nDigits2*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc2, +// UINT32 nDigits2, +// _Out_writes_((nDigits1+nDigits2)*SYMCRYPT_FDEF_DIGIT_NUINT32) PUINT32 pDst ) + +MUL_FUNCTION_START(SymCryptFdef369RawMulAsm, 5, 11) + + // Basic structure: + // for each word in Src1: + // Dst += Src2 * word + // Register assignments + // + // Q0 = tmp for mul + // QH = tmp for mul + // Q1 = pSrc1 (updated in outer loop) + // D2 = # words left from Src1 to process + // Q3 = pSrc2 + // Q4 = nDigits2 + // Q5 = pDst (incremented in outer loop) + // Q6 = inner loop pointer into pSrc2 + // Q7 = inner loop pointer into pDst + // Q8 = word from Src1 to multiply with + // Q9 = carry + // D10 = inner loop counter + + inc D2 + inc D4 + lea D2, [D2 + 2*D2] // nDigits1 * 3 = # words in Src1 to process + + // Outer loop invariant established: Q1, Q3, D4, Q5 + + mov Q6, Q3 // Q6 = pSrc2 + mov Q7, Q5 // Q7 = pDst + outer loop ctr + mov Q8, [Q1] // mulword + xor Q9, Q9 + mov 
D10, D4 + + // First inner loop overwrites Dst, which avoids adding the current Dst value + +ALIGN(16) + +SymCryptFdef369RawMulAsmLoop1: + mov Q0, [Q6] + mul Q8 + add Q0, Q9 + adc QH, 0 + mov [Q7], Q0 + mov Q9, QH + + mov Q0, [Q6 + 8] + mul Q8 + add Q0, Q9 + adc QH, 0 + mov [Q7 + 8], Q0 + mov Q9, QH + + mov Q0, [Q6 + 16] + mul Q8 + add Q0, Q9 + adc QH, 0 + mov [Q7 + 16], Q0 + mov Q9, QH + + add Q6, 24 + add Q7, 24 + dec D10 + jnz SymCryptFdef369RawMulAsmLoop1 + + mov [Q7], QH // write last word, cannot overflow because Dst is at least 2 digits long + + dec D2 + +ALIGN(16) + +SymCryptFdef369RawMulAsmLoopOuter: + + add Q1, 8 // move to next word of pSrc1 + add Q5, 8 // move Dst pointer one word over + mov Q8, [Q1] + mov Q6, Q3 + mov Q7, Q5 + xor Q9, Q9 + mov D10, D4 + +ALIGN(16) + +SymCryptFdef369RawMulAsmLoop2: + mov Q0, [Q6] + mul Q8 + add Q0, [Q7] + adc QH, 0 + add Q0, Q9 + adc QH, 0 + mov [Q7], Q0 + mov Q9, QH + + mov Q0, [Q6 + 8] + mul Q8 + add Q0, [Q7 + 8] + adc QH, 0 + add Q0, Q9 + adc QH, 0 + mov [Q7 + 8], Q0 + mov Q9, QH + + mov Q0, [Q6 + 16] + mul Q8 + add Q0, [Q7 + 16] + adc QH, 0 + add Q0, Q9 + adc QH, 0 + mov [Q7 + 16], Q0 + mov Q9, QH + + add Q6, 24 + add Q7, 24 + dec D10 + jnz SymCryptFdef369RawMulAsmLoop2 + + mov [Q7], QH // write next word. (stays within Dst buffer) + + dec D2 + jnz SymCryptFdef369RawMulAsmLoopOuter + +MUL_FUNCTION_END(SymCryptFdef369RawMulAsm) + +// VOID +// SYMCRYPT_CALL +// SymCryptFdef369MontgomeryReduceAsm( +// _In_ PCSYMCRYPT_MODULUS pmMod, +// _Inout_ PUINT32 pSrc, +// _Out_ PUINT32 pDst ) + +MUL_FUNCTION_START(SymCryptFdef369MontgomeryReduceAsm, 3, 13) + + mov D4, [Q1 + SymCryptModulusNdigitsOffsetAmd64] // nDigits + inc D4 + mov Q5, [Q1 + SymCryptModulusMontgomeryInv64OffsetAmd64] // inv64 + + lea Q1, [Q1 + SymCryptModulusValueOffsetAmd64] // modulus value + + lea D12, [D4 + 2*D4] // outer loop counter, in words + + xor D8, D8 + + // General register allocations + // Q0 = multiply result + // QH = multiply result + // Q1 = pointer to modulus value + // Q2 = pSrc (updated in outer loop) + // Q3 = pDst + // D4 = nDigits + // Q5 = pmMod->tm.montgomery.inv64 + // Q6 = multiplier in inner loop + // Q7 = carry + // Q8 = carry out from last word of previous loop iteration + // Q9 = running pointer in Src + // Q10 = running pointer in Mod + // D11 = loop counter + // D12 = outer loop counter (words) + +ALIGN(16) + +SymCryptFdef369MontgomeryReduceAsmOuterLoop: + + // start decoder with a few simple instructions, including at least one that requires + // a uop execution and is on the critical path + + mov Q6, [Q2] // fetch word of Src we want to set to zero + mov Q10, Q2 + mov Q9, Q1 + + imul Q6, Q5 // lower word is same for signed & unsigned multiply + + mov D11, D4 + xor D7, D7 + +ALIGN(16) + +SymCryptFdef369MontgomeryReduceAsmInnerloop: + // Q0 = mul scratch + // QH = mul scratch + // Q1 = pointer to modulus value + // Q6 = multiplier + // Q7 = carry (64 bits) + // Q9 = running ptr to modulus + // Q10 = running ptr to input/scratch + // D11 = inner loop counter (digits) + // D12 = outer loop counter (words) + + mov Q0, [Q9] + mul Q6 + add Q0, [Q10] + adc QH, 0 + add Q0, Q7 + adc QH, 0 + mov [Q10], Q0 + mov Q7, QH + + mov Q0, [Q9 + 8] + mul Q6 + add Q0, [Q10 + 8] + adc QH, 0 + add Q0, Q7 + adc QH, 0 + mov [Q10 + 8], Q0 + mov Q7, QH + + mov Q0, [Q9 + 16] + mul Q6 + add Q0, [Q10 + 16] + adc QH, 0 + add Q0, Q7 + adc QH, 0 + mov [Q10 + 16], Q0 + mov Q7, QH + + add Q9, 24 + add Q10, 24 + dec D11 + jnz SymCryptFdef369MontgomeryReduceAsmInnerloop + + add Q7, Q8 + 
mov D8, 0
+ adc Q8, 0
+ add Q7, [Q10]
+ adc Q8, 0
+ mov [Q10], Q7
+
+ add Q2, 8
+
+ dec D12
+ jnz SymCryptFdef369MontgomeryReduceAsmOuterLoop
+
+ //
+ // Most of the work is done - now all that is left is to subtract the modulus if it is smaller than the result
+ //
+
+ // First we compute the pSrc result minus the modulus into the destination
+ mov D11, D4 // loop ctr
+ mov Q10, Q2 // pSrc
+ mov Q9, Q1 // pMod
+ mov Q7, Q3 // pDst
+
+ // Cy = 0 because the last 'adc Q8, 0' cannot produce a carry (it leaves Q8 at 0, 1, or 2)
+
+ALIGN(16)
+
+SymCryptFdef369MontgomeryReduceAsmSubLoop:
+ mov Q0,[Q10]
+ sbb Q0,[Q9]
+ mov [Q7], Q0
+
+ mov Q0,[Q10 + 8]
+ sbb Q0,[Q9 + 8]
+ mov [Q7 + 8], Q0
+
+ mov Q0,[Q10 + 16]
+ sbb Q0,[Q9 + 16]
+ mov [Q7 + 16], Q0
+
+ lea Q10,[Q10 + 24]
+ lea Q9,[Q9 + 24]
+ lea Q7,[Q7 + 24]
+
+ dec D11
+ jnz SymCryptFdef369MontgomeryReduceAsmSubLoop
+
+ // Finally a masked copy from pSrc to pDst
+ // copy if: Q8 == 0 && Cy = 1
+ sbb Q8, 0 // mask (64 bits)
+
+ALIGN(16)
+
+SymCryptFdef369MontgomeryReduceAsmMaskedCopyLoop:
+ mov Q0, [Q2]
+ mov Q1, [Q3]
+ xor Q0, Q1
+ and Q0, Q8
+ xor Q0, Q1
+ mov [Q3], Q0
+
+ mov Q0, [Q2 + 8]
+ mov Q1, [Q3 + 8]
+ xor Q0, Q1
+ and Q0, Q8
+ xor Q0, Q1
+ mov [Q3 + 8], Q0
+
+ mov Q0, [Q2 + 16]
+ mov Q1, [Q3 + 16]
+ xor Q0, Q1
+ and Q0, Q8
+ xor Q0, Q1
+ mov [Q3 + 16], Q0
+
+ // Move on to the next digit
+
+ add Q2, 24
+ add Q3, 24
+ dec D4
+ jnz SymCryptFdef369MontgomeryReduceAsmMaskedCopyLoop
+
+MUL_FUNCTION_END(SymCryptFdef369MontgomeryReduceAsm)
+
+FILE_END()
diff --git a/lib/amd64/fdef_asm.asm b/lib/amd64/fdef_asm.asm
deleted file mode 100644
index 8f53bf7..0000000
--- a/lib/amd64/fdef_asm.asm
+++ /dev/null
@@ -1,2165 +0,0 @@
-;
-; fdef_asm.asm Assembler code for large integer arithmetic in the default data format
-;
-; Copyright (c) Microsoft Corporation. Licensed under the MIT license.
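[Editor's note] Before the deleted fdef_asm.asm below, a compact C model of the word-by-word Montgomery reduction just shown may help. This is a sketch only: it assumes the GCC/Clang unsigned __int128 extension, takes nWords = 3 * (nDigits + 1) as the asm does, treats inv64 as the standard Montgomery constant -Mod[0]^(-1) mod 2^64, and omits the constant-time final subtraction (which the asm performs with the sub loop plus masked copy above):

    #include <stdint.h>

    // For each low word of src, pick m so that adding m*mod zeroes that word,
    // then propagate the carries -- mirroring the asm's outer/inner loops.
    static void MontgomeryReduce( const uint64_t *mod, uint64_t inv64,
                                  uint64_t *src, int nWords )
    {
        uint64_t hiCarry = 0;                              // Q8 in the asm
        for( int i = 0; i < nWords; i++ )
        {
            uint64_t m = src[i] * inv64;                   // 'imul Q6, Q5'
            uint64_t carry = 0;
            for( int j = 0; j < nWords; j++ )
            {
                unsigned __int128 t = (unsigned __int128)m * mod[j]
                                      + src[i + j] + carry;
                src[i + j] = (uint64_t)t;                  // becomes 0 for j == 0
                carry = (uint64_t)( t >> 64 );
            }
            unsigned __int128 t = (unsigned __int128)src[i + nWords] + carry + hiCarry;
            src[i + nWords] = (uint64_t)t;
            hiCarry = (uint64_t)( t >> 64 );
        }
        // Result is src[nWords .. 2*nWords - 1] (plus hiCarry); it is < 2*Mod,
        // so one conditional subtraction of Mod finishes the reduction.
    }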
-; - -include ksamd64.inc - -include symcrypt_version.inc -include symcrypt_magic.inc - - - -include C_asm_shared.inc - -include fdef_mul_macros.asm - - altentry SymCryptFdefMontgomerReduce256AsmInternal - - -;UINT32 -;SYMCRYPT_CALL -;SymCryptFdefRawAdd( -; _In_reads_bytes_( nDigits*SYMCRYPT_FDEF_DIGIT_SIZE ) PCBYTE pbSrc, -; _Inout_updates_bytes_( nDigits*SYMCRYPT_FDEF_DIGIT_SIZE ) PBYTE pbDst, -; UINT32 nDigits, -; UINT32 mask ); - - LEAF_ENTRY SymCryptFdefRawAddAsm, _TEXT - - ; rcx = Src1 - ; rdx = Src2 - ; r8 = Dst - ; r9 = nDigits - - add r9d, r9d ; loop over each half digit - xor rax, rax - xor r10, r10 - -SymCryptFdefRawAddAsmLoop: - ; carry is in the carry flag - mov rax,[rcx] - adc rax,[rdx] - mov [r8],rax - - mov rax,[rcx + 8] - adc rax,[rdx + 8] - mov [r8 + 8], rax - - mov rax,[rcx + 16] - adc rax,[rdx + 16] - mov [r8 + 16], rax - - mov rax,[rcx + 24] - adc rax,[rdx + 24] - mov [r8 + 24], rax - - lea rcx, [rcx + 32] - lea rdx, [rdx + 32] - lea r8, [r8 + 32] - dec r9d - jnz SymCryptFdefRawAddAsmLoop - - mov rax, r10 - adc rax, r10 - - ret - - LEAF_END SymCryptFdefRawAddAsm, _TEXT - - -;UINT32 -;SYMCRYPT_CALL -;SymCryptFdefRawSub( -; _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 Src1, -; _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 Src2, -; _Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 Dst, -; UINT32 nDigits ) - - LEAF_ENTRY SymCryptFdefRawSubAsm, _TEXT - - ; rcx = Src1 - ; rdx = Src2 - ; r8 = Dst - ; r9 = nDigits - - add r9d, r9d ; loop over each half digit - xor rax, rax - xor r10, r10 - -SymCryptFdefRawSubAsmLoop: - ; carry is in the carry flag - mov rax,[rcx] - sbb rax,[rdx] - mov [r8],rax - - mov rax,[rcx + 8] - sbb rax,[rdx + 8] - mov [r8 + 8], rax - - mov rax,[rcx + 16] - sbb rax,[rdx + 16] - mov [r8 + 16], rax - - mov rax,[rcx + 24] - sbb rax,[rdx + 24] - mov [r8 + 24], rax - - lea rcx, [rcx + 32] - lea rdx, [rdx + 32] - lea r8, [r8 + 32] - dec r9d - jnz SymCryptFdefRawSubAsmLoop - - mov rax, r10 - adc rax, r10 - - ret - - LEAF_END SymCryptFdefRawSubAsm, _TEXT - - - -;VOID -;SYMCRYPT_CALL -;SymCryptFdefMaskedCopy( -; _In_reads_bytes_( nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCBYTE pbSrc, -; _InOut_writes_bytes_( nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PBYTE pbDst, -; UINT32 nDigits, -; UINT32 mask ) - - LEAF_ENTRY SymCryptFdefMaskedCopyAsm, _TEXT - - add r8d, r8d ; loop over half digits - - movd xmm0, r9d ; xmm0[0] = mask - pcmpeqd xmm1, xmm1 ; xmm1 = ff...ff - pshufd xmm0, xmm0, 0 ; xmm0[0..3] = mask - pxor xmm1, xmm0 ; xmm1 = not Mask - -SymCryptFdefMaskedCopyAsmLoop: - movdqa xmm2, [rcx] ; xmm2 = pSrc[0] - movdqa xmm3, [rdx] ; xmm3 = pDst[0] - pand xmm2, xmm0 ; - pand xmm3, xmm1 ; - por xmm2, xmm3 - movdqa [rdx], xmm2 - - movdqa xmm2, [rcx + 16] ; xmm2 = pSrc[0] - movdqa xmm3, [rdx + 16] ; xmm3 = pDst[0] - pand xmm2, xmm0 ; - pand xmm3, xmm1 ; - por xmm2, xmm3 - movdqa [rdx + 16], xmm2 - - ; Move on to the next digit - - add rcx, 32 - add rdx, 32 - sub r8d, 1 - jnz SymCryptFdefMaskedCopyAsmLoop - ret - - LEAF_END SymCryptFdefMaskedCopyAsm, _TEXT - -;VOID -;SYMCRYPT_CALL -;SymCryptFdefRawMul( -; _In_reads_(nWords1) PCUINT32 pSrc1, -; UINT32 nDigits1, -; _In_reads_(nWords2) PCUINT32 pSrc2, -; UINT32 nDigits2, -; _Out_writes_(nWords1 + nWords2) PUINT32 pDst ) - -SymCryptFdefRawMulAsm_Frame struct - SavedRdi dq ? - SavedRsi dq ? - SavedR15 dq ? - SavedR14 dq ? - SavedR13 dq ? - SavedR12 dq ? - SavedRbx dq ? - returnaddress dq ? - Arg1Home dq ? - Arg2Home dq ? - Arg3Home dq ? - Arg4Home dq ? - pDst dq ? 
- -SymCryptFdefRawMulAsm_Frame ends - - NESTED_ENTRY SymCryptFdefRawMulAsm, _TEXT - - rex_push_reg rbx - push_reg r12 - push_reg r13 - push_reg r14 - push_reg r15 - push_reg rsi - push_reg rdi - - END_PROLOGUE - - ; Basic structure: - ; for each word in Src1: - ; Dst += Src2 * word - ; Register assignments - ; - ; rax = tmp for mul - ; rbx = word from Src1 to multiply with - ; rcx = pSrc1 (updated in outer loop) - ; rdx = tmp for mul - ; rsi = inner loop pointer into pSrc2 - ; rdi = inner loop pointer into pDst - ; r8 = pSrc2 - ; r9 = nDigits2 - ; r10 = pDst (incremented in outer loop) - ; r11 = # words left from Src1 to process - ; r12 = carry for even words (64 bits) - ; r13 = inner loop counter - ; r15 = carry for odd words (64 bits) - - mov r11, rdx ; nDigits1 - shl r11, 3 ; nDigits1 * 8 = # words in Src1 to process - mov r10, [rsp + SymCryptFdefRawMulAsm_Frame.pDst ] - - ; Outer loop invariant established: rcx, r8, r9, r10 - - - mov rsi, r8 ; rsi = pSrc2 - mov rdi, r10 ; rdi = pDst + outer loop ctr - mov rbx, [rcx] ; mulword - xor r12, r12 - mov r13, r9 - - ; First inner loop overwrites Dst, which avoids adding the current Dst value - - ALIGN 16 - -SymCryptFdefRawMulAsmLoop1: - MULT_SINGLEADD_128 0, rsi, rdi - MULT_SINGLEADD_128 2, rsi, rdi - MULT_SINGLEADD_128 4, rsi, rdi - MULT_SINGLEADD_128 6, rsi, rdi - - lea rsi,[rsi + 64] - lea rdi,[rdi + 64] - - dec r13 - jnz SymCryptFdefRawMulAsmLoop1 - - mov [rdi], r12 ; write last word, cannot overflow because Dst is at least 2 digits long - - sub r11, 1 - - ALIGN 16 - -SymCryptFdefRawMulAsmLoopOuter: - - add rcx, 8 ; move to next word of pSrc1 - add r10, 8 ; move Dst pointer one word over - mov rbx, [rcx] - mov rsi, r8 - mov rdi, r10 - xor r12, r12 - mov r13, r9 - - ALIGN 16 - -SymCryptFdefRawMulAsmLoop2: - MULT_DOUBLEADD_128 0, rsi, rdi - MULT_DOUBLEADD_128 2, rsi, rdi - MULT_DOUBLEADD_128 4, rsi, rdi - MULT_DOUBLEADD_128 6, rsi, rdi - - lea rsi,[rsi + 64] - lea rdi,[rdi + 64] - - dec r13 - jnz SymCryptFdefRawMulAsmLoop2 - - mov [rdi], r12 ; write next word. (stays within Dst buffer) - - sub r11, 1 - jnz SymCryptFdefRawMulAsmLoopOuter - - BEGIN_EPILOGUE - - pop rdi - pop rsi - pop r15 - pop r14 - pop r13 - pop r12 - pop rbx - ret - - NESTED_END SymCryptFdefRawMulAsm, _TEXT - -; VOID -; SYMCRYPT_CALL -; SymCryptFdefRawSquareAsm( -; _In_reads_(nDgigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc, -; UINT32 nDigits, -; _Out_writes_(2*nWords) PUINT32 pDst ) - -SymCryptFdefRawSquareAsm_Frame struct - - SavedRcx dq ? - SavedRdi dq ? - SavedRsi dq ? - SavedR15 dq ? - SavedR14 dq ? - SavedR13 dq ? - SavedR12 dq ? - SavedRbx dq ? - returnaddress dq ? - Arg1Home dq ? - Arg2Home dq ? - Arg3Home dq ? 
- -SymCryptFdefRawSquareAsm_Frame ends - - NESTED_ENTRY SymCryptFdefRawSquareAsm, _TEXT - - rex_push_reg rbx - push_reg r12 - push_reg r13 - push_reg r14 - push_reg r15 - push_reg rsi - push_reg rdi - push_reg rcx - - END_PROLOGUE - - ; Register assignments - ; - ; rax = tmp for mul - ; rbx = word from Src to multiply with - ; rcx = outer loop pointer into pSrc - ; rdx = tmp for mul - ; rsi = inner loop pointer into pSrc - ; rdi = inner loop pointer into pDst - ; r8 = pDst (constant) - ; r9 = nDigits (constant) - ; r10 = outer loop pointer into pDst - ; r11 = outer loop counter of #words left - ; r12 = carry for even words (64 bits) - ; r13 = inner loop counter of #words left - ; r14 = cyclic counter that specifies on which branch we jump into - ; r15 = carry for odd words (64 bits) - - mov r9, rdx ; nDigits - - ; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - ; First Pass - Addition of the cross products x_i*x_j with i!=j - ; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - ; - ; At the beginning of each inner loop we will jump over the - ; words that don't need processing. The decision of the jump - ; will be based on the cyclic counter r14. - ; - ; For the first pass we loop over **half** digits since having a smaller - ; number of jumps (i.e. 4) is actually faster than having 8 jumps. - ; - ; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - - mov r11, rdx ; nDigits - shl r11, 3 ; r11 = outer #words - mov r10, r8 ; r10 = outer pDst - - mov rsi, rcx ; rsi = inner pSrc - mov rdi, r10 ; rdi = inner pDst - - ; Initial inner loop overwrites Dst, which avoids adding the current Dst value - - mov rbx, [rcx] ; mulword - - xor r12, r12 ; carry = 0 - xor r15, r15 ; carry = 0 - - mov r13, r11 ; r13 = inner #words - mov [rdi], r12 ; Write 0 in the first word - - ; Skip over the first word - jmp SymCryptFdefRawSquareAsmInnerLoopInit_Word1 - - ALIGN 16 -SymCryptFdefRawSquareAsmInnerLoopInit_Word0: - SQR_SINGLEADD_64 0, rsi, rdi, r12, r15 - - ALIGN 16 -SymCryptFdefRawSquareAsmInnerLoopInit_Word1: - SQR_SINGLEADD_64 1, rsi, rdi, r15, r12 - - SQR_SINGLEADD_64 2, rsi, rdi, r12, r15 - - SQR_SINGLEADD_64 3, rsi, rdi, r15, r12 - - lea rsi, [rsi + 32] - lea rdi, [rdi + 32] - sub r13, 4 - jnz SymCryptFdefRawSquareAsmInnerLoopInit_Word0 - - mov [rdi], r12 ; write last word, cannot overflow because Dst is at least 2 digits long - - sub r11, 1 ; Counter for the outer loop - mov r14, 1 ; Cyclic counter r14 = 1 - - ALIGN 16 -SymCryptFdefRawSquareAsmLoopOuter: - - add r10, 8 ; move Dst pointer 1 word over - - mov rsi, rcx ; rsi = inner pSrc - mov rdi, r10 ; rdi = inner pDst - - mov rbx, [rcx + 8*r14] ; Get the next mulword - - inc r14b ; Increment the cyclic counter by 1 - - mov r13, r11 ; # of words for the inner loop - add r13, 2 - and r13, 0FFFFFFFFFFFFFFFCh ; Zero out the 2 lower bits - - xor r12, r12 ; carry = 0 - xor r15, r15 ; carry = 0 - - ; Logic to find the correct jump - cmp r14b, 3 - je SymCryptFdefRawSquareAsmInnerLoop_Word3 - cmp r14b, 2 - je SymCryptFdefRawSquareAsmInnerLoop_Word2 - cmp r14b, 1 - je SymCryptFdefRawSquareAsmInnerLoop_Word1 - - ; The following instructions are only executed when r14b == 4 - xor r14b, r14b ; Set it to 0 for the next iteration - - add rcx, 32 ; move pSrc 4 words over - add r10, 32 ; move destination 4 words over - - mov rsi, rcx ; rsi = inner pSrc - mov rdi, r10 ; rdi = inner pDst - - ALIGN 16 -SymCryptFdefRawSquareAsmInnerLoop_Word0: - SQR_DOUBLEADD_64 0, rsi, rdi, r12, r15 - - ALIGN 16 
-SymCryptFdefRawSquareAsmInnerLoop_Word1: - SQR_DOUBLEADD_64 1, rsi, rdi, r15, r12 - - ALIGN 16 -SymCryptFdefRawSquareAsmInnerLoop_Word2: - SQR_DOUBLEADD_64 2, rsi, rdi, r12, r15 - - ALIGN 16 -SymCryptFdefRawSquareAsmInnerLoop_Word3: - SQR_DOUBLEADD_64 3, rsi, rdi, r15, r12 - - lea rsi, [rsi + 32] - lea rdi, [rdi + 32] - sub r13, 4 - jnz SymCryptFdefRawSquareAsmInnerLoop_Word0 - - mov [rdi], r12 ; write next word. (stays within Dst buffer) - - dec r11 - cmp r11, 1 - jne SymCryptFdefRawSquareAsmLoopOuter - - xor rdx, rdx - mov [r10 + 40], rdx ; Final word = 0 - - - ; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - ; Second Pass - Shifting all results 1 bit left - ; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - - mov r11, r9 ; nDigits - mov rdi, r8 ; pDst pointer - shl r11, 1 ; 2*nDigits - - ALIGN 16 -SymCryptFdefRawSquareAsmSecondPass: - SQR_SHIFT_LEFT 0 - SQR_SHIFT_LEFT 1 - SQR_SHIFT_LEFT 2 - SQR_SHIFT_LEFT 3 - - SQR_SHIFT_LEFT 4 - SQR_SHIFT_LEFT 5 - SQR_SHIFT_LEFT 6 - SQR_SHIFT_LEFT 7 - - lea rdi, [rdi + 64] - dec r11 - jnz SymCryptFdefRawSquareAsmSecondPass - - ; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - ; Third Pass - Adding the squares on the even columns and propagating the sum - ; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - - mov rsi, [rsp + SymCryptFdefRawSquareAsm_Frame.SavedRcx] - mov rdi, r8 ; rdi = pDst - - xor r12, r12 - -SymCryptFdefRawSquareAsmThirdPass: - SQR_DIAGONAL_PROP 0 - SQR_DIAGONAL_PROP 1 - SQR_DIAGONAL_PROP 2 - SQR_DIAGONAL_PROP 3 - SQR_DIAGONAL_PROP 4 - SQR_DIAGONAL_PROP 5 - SQR_DIAGONAL_PROP 6 - SQR_DIAGONAL_PROP 7 - - add rsi, 64 ; One digit up - add rdi, 128 ; Two digits up - sub r9, 1 - jnz SymCryptFdefRawSquareAsmThirdPass - - BEGIN_EPILOGUE - - pop rcx - pop rdi - pop rsi - pop r15 - pop r14 - pop r13 - pop r12 - pop rbx - ret - - NESTED_END SymCryptFdefRawSquareAsm, _TEXT - - -;VOID -;SymCryptFdefMontgomeryReduceAsm( -; _In_ PCSYMCRYPT_MODULUS pmMod, -; _In_ PUINT32 pSrc, -; _Out_ PUINT32 pDst ) - - NESTED_ENTRY SymCryptFdefMontgomeryReduceAsm, _TEXT - - rex_push_reg rbx - push_reg r12 - push_reg r13 - push_reg r14 - push_reg r15 - push_reg rsi - push_reg rdi - push_reg rbp - - END_PROLOGUE - - mov r11, rdx ; r11 = pSrc - mov ebp, [rcx + SymCryptModulusNdigitsOffsetAmd64] ; nDigits - mov r13, [rcx + SymCryptModulusMontgomeryInv64OffsetAmd64] ; inv64 - - lea rcx, [rcx + SymCryptModulusValueOffsetAmd64] ; modulus value - - mov edi, ebp ; outer loop counter - shl edi, 3 ; edi is in words - - xor r14d, r14d - - ; General register allocations - ; rax = multiply result - ; rbx = multiplier in inner loop - ; rcx = pointer to modulus value - ; rdx = multiply result - ; rsi = loop counter - ; rdi = loop counter - ; rbp = nDigits - ; r8 = pDst - ; r9 = running pointer in Src - ; r10 = running pointer in Mod - ; r11 = pSrc (updated in outer loop) - ; r12 = carry - ; r13 = pmMod->tm.montgomery.inv64 - ; r14 = carry out from last word of previous loop iteration - - ALIGN 16 - -SymCryptFdefMontgomeryReduceAsmOuterLoop: - - ; start decoder with a few simple instructions, including at least one that requires - ; a uop execution and is on the critical path - - mov rbx, [r11] ; fetch word of Src we want to set to zero - mov r10, r11 - mov r9, rcx - - imul rbx, r13 ; lower word is same for signed & unsigned multiply - - mov esi, ebp - xor r12d, r12d - - ALIGN 16 - -SymCryptFdefMontgomeryReduceAsmInnerloop: - ; rax = mul scratch - ; rbx = multiplier - ; rcx = pointer to 
modulus value - ; rdx = mul scratch - ; edi = outer loop counter (words) - ; esi = inner loop counter (digits) - ; r9 = running ptr to modulus - ; r10 = running ptr to input/scratch - ; r12 = carry for even words (64 bits) - ; r15 = carry for odd words (64 bits) - - MULT_DOUBLEADD_128 0, r9, r10 - MULT_DOUBLEADD_128 2, r9, r10 - MULT_DOUBLEADD_128 4, r9, r10 - MULT_DOUBLEADD_128 6, r9, r10 - - lea r9,[r9 + 64] - lea r10,[r10 + 64] - - dec esi - jnz SymCryptFdefMontgomeryReduceAsmInnerloop - - add r12, r14 - mov r14d, 0 - adc r14, 0 - add r12, [r10] - adc r14, 0 - mov [r10], r12 - - lea r11,[r11 + 8] - - dec edi - jnz SymCryptFdefMontgomeryReduceAsmOuterLoop - - ; - ; Most of the work is done; now all that is left is subtract the modulus if it is smaller than the result - ; - - ; First we compute the pSrc result minus the modulus into the destination - mov esi, ebp ; loop ctr - mov r10, r11 ; pSrc - mov r9, rcx ; pMod - mov r12, r8 ; pDst - - ; Cy = 0 because the last 'sub edi,1' resulted in 0 - - ALIGN 16 - -SymCryptFdefMontgomeryReduceAsmSubLoop: - mov rax,[r10] - sbb rax,[r9] - mov [r12], rax - - mov rax,[r10 + 8] - sbb rax,[r9 + 8] - mov [r12 + 8], rax - - mov rax,[r10 + 16] - sbb rax,[r9 + 16] - mov [r12 + 16], rax - - mov rax,[r10 + 24] - sbb rax,[r9 + 24] - mov [r12 + 24], rax - - mov rax,[r10 + 32] - sbb rax,[r9 + 32] - mov [r12 + 32], rax - - mov rax,[r10 + 40] - sbb rax,[r9 + 40] - mov [r12 + 40], rax - - mov rax,[r10 + 48] - sbb rax,[r9 + 48] - mov [r12 + 48], rax - - mov rax,[r10 + 56] - sbb rax,[r9 + 56] - mov [r12 + 56], rax - - lea r10,[r10 + 64] - lea r9,[r9 + 64] - lea r12,[r12 + 64] - - dec esi - jnz SymCryptFdefMontgomeryReduceAsmSubLoop - - ; Finally a masked copy form pSrc to pDst - ; copy if: r14 == 0 && Cy = 1 - sbb r14d, 0 - - movd xmm0, r14d ; xmm0[0] = mask - pcmpeqd xmm1, xmm1 ; xmm1 = ff...ff - pshufd xmm0, xmm0, 0 ; xmm0[0..3] = mask - pxor xmm1, xmm0 ; xmm1 = not Mask - - ALIGN 16 - -SymCryptFdefMontgomeryReduceAsmMaskedCopyLoop: - movdqa xmm2, [r11] ; xmm2 = pSrc[0] - movdqa xmm3, [r8] ; xmm3 = pDst[0] - pand xmm2, xmm0 ; - pand xmm3, xmm1 ; - por xmm2, xmm3 - movdqa [r8], xmm2 - - movdqa xmm2, [r11 + 16] ; xmm2 = pSrc[0] - movdqa xmm3, [r8 + 16] ; xmm3 = pDst[0] - pand xmm2, xmm0 ; - pand xmm3, xmm1 ; - por xmm2, xmm3 - movdqa [r8 + 16], xmm2 - - movdqa xmm2, [r11 + 32] ; xmm2 = pSrc[0] - movdqa xmm3, [r8 + 32] ; xmm3 = pDst[0] - pand xmm2, xmm0 ; - pand xmm3, xmm1 ; - por xmm2, xmm3 - movdqa [r8 + 32], xmm2 - - movdqa xmm2, [r11 + 48] ; xmm2 = pSrc[0] - movdqa xmm3, [r8 + 48] ; xmm3 = pDst[0] - pand xmm2, xmm0 ; - pand xmm3, xmm1 ; - por xmm2, xmm3 - movdqa [r8 + 48], xmm2 - - ; Move on to the next digit - lea r11,[r11 + 64] - lea r8,[r8 + 64] - - dec ebp - jnz SymCryptFdefMontgomeryReduceAsmMaskedCopyLoop - - BEGIN_EPILOGUE - - pop rbp - pop rdi - pop rsi - pop r15 - pop r14 - pop r13 - pop r12 - pop rbx - ret - - NESTED_END SymCryptFdefMontgomeryReduceAsm, _TEXT - - -; -------------------------------- -; 256-bit size specific functions -; -------------------------------- - -;VOID -;SYMCRYPT_CALL -;SymCryptFdefModAdd256( -; _In_ PCSYMCRYPT_MODULUS pmMod, -; _In_ PCSYMCRYPT_MODELEMENT peSrc1, -; _In_ PCSYMCRYPT_MODELEMENT peSrc2, -; _Out_ PSYMCRYPT_MODELEMENT peDst, -; _Out_writes_bytes_( cbScratch ) PBYTE pbScratch, -; SIZE_T cbScratch ); - - NESTED_ENTRY SymCryptFdefModAdd256Asm, _TEXT - - push_reg r12 - push_reg r13 - push_reg r14 - push_reg rbx - - END_PROLOGUE - - ; rcx = pmMod - ; rdx = peSrc1 - ; r8 = peSrc2 - ; r9 = peDst - - ; compute Src1 + Src2 
into (rax, rbx, r10, r11) with carry out mask in r12 - - mov rax, [rdx] - add rax, [r8 ] - mov rbx, [rdx + 8] - adc rbx, [r8 + 8] - mov r10, [rdx + 16] - adc r10, [r8 + 16] - mov r11, [rdx + 24] - adc r11, [r8 + 24] - sbb r12, r12 ; r12 = carry out mask - - ; rdx, r8: free - - ; Compute sum - Mod into (rdx, r8, r13, r14) = sum - modulus, rcx = carry out mask - - add rcx, SymCryptModulusValueOffsetAmd64 - - mov rdx, rax - sub rdx, [rcx] - mov r8, rbx - sbb r8, [rcx + 8] - mov r13, r10 - sbb r13, [rcx + 16] - mov r14, r11 - sbb r14, [rcx + 24] - - sbb rcx, rcx ; rcx = carry out mask - - ; Choose between the two - ; addition carry = 1, then subtraction carry = 1 and we pick the 2nd result. - ; addition carry = 0 and subtraction carry = 0: pick 2nd result - ; addition carry = 0 and subtraction carry = 1: pick first result - - xor rcx, r12 ; 0 = 2nd result, 1 = first result - - xor rax, rdx - xor rbx, r8 - xor r10, r13 - xor r11, r14 - - and rax, rcx - and rbx, rcx - and r10, rcx - and r11, rcx - - xor rdx, rax - xor r8 , rbx - xor r13, r10 - xor r14, r11 - - mov [r9 + 0], rdx - mov [r9 + 8], r8 - mov [r9 + 16], r13 - mov [r9 + 24], r14 - - BEGIN_EPILOGUE - - pop rbx - pop r14 - pop r13 - pop r12 - ret - - NESTED_END SymCryptFdefModAdd256Asm, _TEXT - - - - NESTED_ENTRY SymCryptFdefModSub256Asm, _TEXT - - push_reg r12 - push_reg r13 - push_reg rbx - - END_PROLOGUE - - ; rcx = pmMod - ; rdx = peSrc1 - ; r8 = peSrc2 - ; r9 = peDst - - ; compute Src1 - Src2 into (rax, rbx, r10, r11) with carry out mask in r12 - - mov rax, [rdx] - sub rax, [r8 ] - mov rbx, [rdx + 8] - sbb rbx, [r8 + 8] - mov r10, [rdx + 16] - sbb r10, [r8 + 16] - mov r11, [rdx + 24] - sbb r11, [r8 + 24] - sbb r12, r12 ; r12 = carry out mask - - ; rdx, r8: free - - ; Load Mod into into (rdx, r8, r13, rcx) - - add rcx, SymCryptModulusValueOffsetAmd64 - - mov rdx, [rcx] - mov r8, [rcx + 8] - mov r13, [rcx + 16] - mov rcx, [rcx + 24] - - ; Mask the value to be added to zero if there was no underflow - and rdx, r12 - and r8 , r12 - and r13, r12 - and rcx, r12 - - ; Add the (masked) modulus - add rax, rdx - adc rbx, r8 - adc r10, r13 - adc r11, rcx - - mov [r9 + 0], rax - mov [r9 + 8], rbx - mov [r9 + 16], r10 - mov [r9 + 24], r11 - - BEGIN_EPILOGUE - - pop rbx - pop r13 - pop r12 - ret - - NESTED_END SymCryptFdefModSub256Asm, _TEXT - -;================================================= -; Multiplication -; - -;VOID -;SYMCRYPT_CALL -;SymCryptFdefModMulMontgomery256Asm( -; _In_ PCSYMCRYPT_MODULUS pMod, -; _In_ PCSYMCRYPT_MODELEMENT pSrc1, -; _In_ PCSYMCRYPT_MODELEMENT pSrc2, -; _Out_ PSYMCRYPT_MODELEMENT pDst, -; _Out_writes_bytes_( cbScratch ) PBYTE pbScratch, -; SIZE_T cbScratch ); - - NESTED_ENTRY SymCryptFdefModMulMontgomery256Asm, _TEXT - - MULT_COMMON_PROLOGUE ; saves all registers - - mov rsi, rdx ; we need rdx for the multiplication - - ; rcx = pMod - ; rsi = pSrc1 - ; r8 = pSrc2 - ; r9 = pDst - - ; First we compute the product. 
The result will be in 8 registers - ; rdi, rbp, r10, r11, r12, r13, r14, r15 - - mov rbx, [rsi] - xor r10, r10 - xor r11, r11 - xor r12, r12 - - mov rax, [r8] - mul rbx - mov rdi, rax - mov rbp, rdx - - mov rax, [r8 + 8] - mul rbx - add rbp, rax - adc r10, rdx - - mov rax, [r8 + 16] - mul rbx - add r10, rax - adc r11, rdx - - mov rax, [r8 + 24] - mul rbx - add r11, rax - adc r12, rdx - - ; Second row - mov rbx, [rsi + 8] - MUL14 rbx, r8, rbp, r10, r11, r12, r15 - mov r13, rdx - - ; third row - mov rbx, [rsi + 16] - MUL14 rbx, r8, r10, r11, r12, r13, r15 - mov r14, rdx - - ; fourth row - mov rbx, [rsi + 24] - MUL14 rbx, r8, r11, r12, r13, r14, r15 - mov r15, rdx - - - ALTERNATE_ENTRY SymCryptFdefMontgomerReduce256AsmInternal - ; Invariant: - ; common prologue used - ; 512-bit result in (rdi, rbp, r10, r11, r12, r13, r14, r15) - ; rcx = pmMod - ; r9 = peDst - - mov r8, [rcx + SymCryptModulusMontgomeryInv64OffsetAmd64] ; inv64 - add rcx, SymCryptModulusValueOffsetAmd64 - - mov rbx, rdi - imul rbx, r8 ; lower word is the same for signed & unsigned multiply; rbx = multiplicand for first row - MUL14 rbx, rcx, rdi, rbp, r10, r11, rdi - mov rdi, rdx ; Save the out carries in (eventually) (rdi, rbp, r10, r11) - - mov rbx, rbp - imul rbx, r8 - MUL14 rbx, rcx, rbp, r10, r11, r12, rbp - mov rbp, rdx ; Save the out carries in (eventually) (rdi, rbp, r10, r11) - - mov rbx, r10 - imul rbx, r8 - MUL14 rbx, rcx, r10, r11, r12, r13, r10 - mov r10, rdx - - mov rbx, r11 - imul rbx, r8 - MUL14 rbx, rcx, r11, r12, r13, r14, r11 - ; mov r11, rdx - - add r12, rdi - adc r13, rbp - adc r14, r10 - adc r15, rdx - - sbb rbx, rbx ; Carry out from final addition in mask form - - ; reduced value in (r12, r13, r14, r15, -rbx), and it is less than 2*Modulus - - mov rdi, r12 - sub rdi, [rcx] - mov rbp, r13 - sbb rbp, [rcx + 8] - mov r10, r14 - sbb r10, [rcx + 16] - mov r11, r15 - sbb r11, [rcx + 24] - - sbb rcx, rcx ; rcx = carry out mask - - ; Choose between the two - ; addition carry = 1, then subtraction carry = 1 and we pick the 2nd result. - ; addition carry = 0 and subtraction carry = 0: pick 2nd result - ; addition carry = 0 and subtraction carry = 1: pick first result - - xor rcx, rbx ; 0 = 2nd result, 1 = first result - - xor r12, rdi - xor r13, rbp - xor r14, r10 - xor r15, r11 - - and r12, rcx - and r13, rcx - and r14, rcx - and r15, rcx - - xor rdi, r12 - xor rbp, r13 - xor r10, r14 - xor r11, r15 - - mov [r9 + 0], rdi - mov [r9 + 8], rbp - mov [r9 + 16], r10 - mov [r9 + 24], r11 - - MULT_COMMON_EPILOGUE - - NESTED_END SymCryptFdefModMulMontgomery256Asm, _TEXT - - -;VOID -;SYMCRYPT_CALL -;SymCryptFdefMontgomeryReduce256Asm( -; _In_ PCSYMCRYPT_MODULUS pmMod, -; _In_ PUINT32 pSrc, -; _Out_ PUINT32 pDst ); - - NESTED_ENTRY SymCryptFdefMontgomeryReduce256Asm, _TEXT - - MULT_COMMON_PROLOGUE ; saves all registers - - mov r9, r8 - mov rdi, [rdx + 0] - mov rbp, [rdx + 8] - mov r10, [rdx + 16] - mov r11, [rdx + 24] - mov r12, [rdx + 32] - mov r13, [rdx + 40] - mov r14, [rdx + 48] - mov r15, [rdx + 56] - - - ; Normal code doesn't jump from the body of one function to the body of another function. - ; Here we have ensured that our stack frames are identical, so it is safe. - ; We just have to convince the other system components that this works... 
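The 256-bit routines above all end the same way: compute both candidate results (the raw value and the value minus the modulus), turn the carry/borrow flags into full-word masks with `sbb reg, reg`, and merge the two candidates with an xor/and/xor sequence. A minimal C sketch of that branch-free selection idiom (illustrative only, not SymCrypt source; the helper names are hypothetical):

```c
#include <stdint.h>

/* Turn a 0/1 borrow into an all-zero / all-ones mask,
   like the 'sbb rcx, rcx' idiom in the assembly above. */
static inline uint64_t mask_from_borrow( uint64_t borrow )
{
    return (uint64_t)0 - borrow;
}

/* mask == all-ones selects a[], mask == 0 selects b[];
   this is the xor/and/xor merge used after the conditional subtract. */
static void ct_select( uint64_t *dst, const uint64_t *a, const uint64_t *b,
                       uint64_t mask, int n )
{
    for( int i = 0; i < n; i++ )
    {
        uint64_t t = (a[i] ^ b[i]) & mask;
        dst[i] = b[i] ^ t;
    }
}
```

Because the selection is data-independent, the running time does not reveal whether the final subtraction of the modulus was needed.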
- - ; Use conditional jump so that stack unwinder doesn't think it is an epilogue - test rsp,rsp - jne SymCryptFdefMontgomerReduce256AsmInternal ; jumps always - - int 3 ; Dummy instruction because the debugger seems to have an off-by-one - ; error and still see the (wrong) epilogue when on the JNE instruction - ; Best guess: the debugger starts the stack trace *after* the current instruction - - ; And then we need a dummy epilogue to keep the assembler happy - BEGIN_EPILOGUE - ret - - NESTED_END SymCryptFdefMontgomeryReduce256Asm, _TEXT - - -;VOID -;SYMCRYPT_CALL -;SymCryptFdefModSquareMontgomery256( -; _In_ PCSYMCRYPT_MODULUS pmMod, -; _In_ PCSYMCRYPT_MODELEMENT peSrc, -; _Out_ PSYMCRYPT_MODELEMENT peDst, -; _Out_writes_bytes_( cbScratch ) PBYTE pbScratch, -; SIZE_T cbScratch ) - - NESTED_ENTRY SymCryptFdefModSquareMontgomery256Asm, _TEXT - - MULT_COMMON_PROLOGUE - - - ; Result in rdi, rbp, r10, r11, r12, r13, r14, r15 - - mov rsi, rdx ; free up rdx for multiplication - mov r9, r8 ; need this later anyway - - ; rcx = pmMod - ; rsi = Src - ; r9 = pDst - - mov rbx, [rsi] - xor r11, r11 - xor r12, r12 - xor r13, r13 - xor r14, r14 - - ; First we compute all the terms that need doubling - - mov rax, [rsi + 8] - mul rbx - mov rbp, rax - mov r10, rdx - - mov rax, [rsi + 16] - mul rbx - add r10, rax - adc r11, rdx - - mov rax, [rsi + 24] - mul rbx - add r11, rax - adc r12, rdx - - mov rbx, [rsi + 8] - mov rax, [rsi + 16] - mul rbx - add r11, rax - adc rdx, 0 - mov r15, rdx - - mov rax, [rsi + 24] - mul rbx - add r12, rax - adc rdx, 0 - add r12, r15 - adc r13, rdx - - mov rbx, [rsi + 16] - mov rax, [rsi + 24] - mul rbx - add r13, rax - adc r14, rdx ; no overflow from this - - ; double these terms - xor r15, r15 - - add rbp, rbp - adc r10, r10 - adc r11, r11 - adc r12, r12 - adc r13, r13 - adc r14, r14 - adc r15, 0 - - mov rax, [rsi] - mul rax - mov rdi, rax - mov rbx, rdx - - mov rax, [rsi + 8] - mul rax - - add rbp, rbx - adc r10, rax - adc r11, rdx - sbb r8, r8 ; -carry - - mov rax, [rsi + 16] - mul rax - - add r8, r8 - adc r12, rax - adc r13, rdx - sbb r8, r8 - - mov rax, [rsi + 24] - mul rax - add r8, r8 - adc r14, rax - adc r15, rdx - - ; See SymCryptFdefMontgomeryReduce256Asm for a discussion of this strange epilogue sequence - test rsp,rsp - jne SymCryptFdefMontgomerReduce256AsmInternal ; jumps always - - int 3 - - BEGIN_EPILOGUE - ret - - NESTED_END SymCryptFdefModSquareMontgomery256Asm, _TEXT - -; -------------------------------- -; 512-bit size specific functions -; -------------------------------- - -;VOID -;SYMCRYPT_CALL -;SymCryptFdefRawMul512Asm( -; _In_reads_(nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc1, -; _In_reads_(nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc2, -; UINT32 nDigits, -; _Out_writes_(2*nWords) PUINT32 pDst ); - NESTED_ENTRY SymCryptFdefRawMul512Asm, _TEXT - - MULT_COMMON_PROLOGUE ; saves all registers - - ; Basic structure: - ; for each word in Src1: - ; Dst += Src2 * word - ; Register assignments - ; - ; rax = tmp for mul - ; rbx = word from Src1 to multiply with - ; rcx = pSrc1 (updated in outer loop) - ; rdx = tmp for mul - ; rsi = pSrc2 (constant) - ; rdi = pDst (incremented in outer loop) - ; r8 = nDigits (constant) - ; r9 = pDst (constant) - ; r11 = # words left from Src1 to process - ; r12 = carry for even words (64 bits) - ; r15 = carry for odd words (64 bits) - - mov r11, r8 ; nDigits - shl r11, 3 ; nDigits * 8 = # words in Src1 to process - - mov rsi, rdx ; rsi = pSrc2 - mov rdi, r9 ; rdi = pDst - mov rbx, [rcx] ; mulword - - xor r12, r12 ; 
carry - - ; First inner loop overwrites Dst, which avoids adding the current Dst value - MULT_SINGLEADD_128 0, rsi, rdi - MULT_SINGLEADD_128 2, rsi, rdi - MULT_SINGLEADD_128 4, rsi, rdi - MULT_SINGLEADD_128 6, rsi, rdi - - mov [rdi + 64], r12 ; write last word, cannot overflow because Dst is at least 2 digits long - - sub r11, 1 - - ALIGN 16 - -SymCryptFdefRawMul512AsmLoopOuter: - - lea rcx, [rcx + 8] ; move to next word of pSrc1 - lea rdi, [rdi + 8] ; move Dst pointer one word over - - mov rbx, [rcx] ; mulword - - xor r12, r12 ; carry - - MULT_DOUBLEADD_128 0, rsi, rdi - MULT_DOUBLEADD_128 2, rsi, rdi - MULT_DOUBLEADD_128 4, rsi, rdi - MULT_DOUBLEADD_128 6, rsi, rdi - - mov [rdi + 64], r12 ; write last word, cannot overflow because Dst is at least 2 digits long - - sub r11, 1 - jnz SymCryptFdefRawMul512AsmLoopOuter - - MULT_COMMON_EPILOGUE - - NESTED_END SymCryptFdefRawMul512Asm, _TEXT - -; VOID -; SYMCRYPT_CALL -; SymCryptFdefRawSquareAsm( -; _In_reads_(nDgigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc, -; UINT32 nDigits, -; _Out_writes_(2*nWords) PUINT32 pDst ) - NESTED_ENTRY SymCryptFdefRawSquare512Asm, _TEXT - - MULT_COMMON_PROLOGUE - - ; Register assignments - ; - ; rax = tmp for mul - ; rbx = word from Src to multiply with - ; rcx = outer loop pointer into pSrc - ; rdx = tmp for mul - ; rsi = inner loop pointer into pSrc - ; rdi = inner loop pointer into pDst - ; r8 = pDst (constant) - ; r9 = nDigits (constant) - ; r10 = outer loop pointer into pDst - ; r11 = outer loop counter of #words left - ; r12 = carry for even words (64 bits) - ; r13 = inner loop counter of #words left - ; r14 = pSrc (constant) - ; r15 = carry for odd words (64 bits) - - mov r9, rdx ; nDigits - mov r14, rcx ; saving pSrc - - ; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - ; First Pass - Addition of the cross products x_i*x_j with i!=j - ; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - - mov r11, rdx ; nDigits - shl r11, 3 ; r11 = outer #words - mov r10, r8 ; r10 = outer pDst - - mov rsi, rcx ; rsi = inner pSrc - mov rdi, r10 ; rdi = inner pDst - - ; Initial inner loop overwrites Dst, which avoids adding the current Dst value - ; 7 iterations - xor r15, r15 ; carry = 0 (for "odd" iterations set only the r15 carry) - mov rbx, [rcx] ; mulword - mov [rdi], r15 ; Write 0 in the first word - - SQR_SINGLEADD_64 1, rsi, rdi, r15, r12 - SQR_SINGLEADD_64 2, rsi, rdi, r12, r15 - SQR_SINGLEADD_64 3, rsi, rdi, r15, r12 - - SQR_SINGLEADD_64 4, rsi, rdi, r12, r15 - SQR_SINGLEADD_64 5, rsi, rdi, r15, r12 - SQR_SINGLEADD_64 6, rsi, rdi, r12, r15 - SQR_SINGLEADD_64 7, rsi, rdi, r15, r12 - - mov [rdi + 8*8], r12 ; write last word, cannot overflow because Dst is at least 2 digits long - add r10, 8 ; Skip over the first word - - ; 6 iterations - xor r12, r12 ; carry = 0 (for "even" iterations set only the r12 carry) - SQR_SIZE_SPECIFIC_INIT - SQR_DOUBLEADD_64_2 0 - SQR_DOUBLEADD_64_4 2 - mov [rdi + 6*8], r12 - - ; 5 iterations - xor r15, r15 - SQR_SIZE_SPECIFIC_INIT - SQR_DOUBLEADD_64 0, rsi, rdi, r15, r12 ; Notice the dst_carry is r12 since all the "double" macros have r12 as src_carry - SQR_DOUBLEADD_64_4 1 - mov [rdi + 5*8], r12 - - ; 4 iterations - xor r12, r12 - SQR_SIZE_SPECIFIC_INIT - SQR_DOUBLEADD_64_4 0 - mov [rdi + 4*8], r12 - - ; 3 iterations - xor r15, r15 - SQR_SIZE_SPECIFIC_INIT - SQR_DOUBLEADD_64 0, rsi, rdi, r15, r12 - SQR_DOUBLEADD_64_2 1 - mov [rdi + 3*8], r12 - - ; 2 iterations - xor r12, r12 - SQR_SIZE_SPECIFIC_INIT - SQR_DOUBLEADD_64_2 0 - mov [rdi + 2*8], r12 - - ; 1 
iterations - xor r15, r15 - SQR_SIZE_SPECIFIC_INIT - SQR_DOUBLEADD_64 0, rsi, rdi, r15, r12 - mov [rdi + 8], r12 - - xor rdx, rdx - mov [rdi + 16], rdx ; Final word = 0 - - - ; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - ; Second Pass - Shifting all results 1 bit left - ; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - - mov r11, r9 ; nDigits - mov rdi, r8 ; pDst pointer - shl r11, 1 ; 2*nDigits - - ALIGN 16 -SymCryptFdefRawSquare512AsmSecondPass: - SQR_SHIFT_LEFT 0 - SQR_SHIFT_LEFT 1 - SQR_SHIFT_LEFT 2 - SQR_SHIFT_LEFT 3 - - SQR_SHIFT_LEFT 4 - SQR_SHIFT_LEFT 5 - SQR_SHIFT_LEFT 6 - SQR_SHIFT_LEFT 7 - - lea rdi, [rdi + 64] - dec r11 - jnz SymCryptFdefRawSquare512AsmSecondPass - - ; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - ; Third Pass - Adding the squares on the even columns and propagating the sum - ; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - - mov rsi, r14 ; rsi = pSrc - mov rdi, r8 ; rdi = pDst - - xor r12, r12 - - SQR_DIAGONAL_PROP 0 - SQR_DIAGONAL_PROP 1 - SQR_DIAGONAL_PROP 2 - SQR_DIAGONAL_PROP 3 - SQR_DIAGONAL_PROP 4 - SQR_DIAGONAL_PROP 5 - SQR_DIAGONAL_PROP 6 - SQR_DIAGONAL_PROP 7 - - MULT_COMMON_EPILOGUE - - NESTED_END SymCryptFdefRawSquare512Asm, _TEXT - -;VOID -;SymCryptFdefMontgomeryReduce512Asm( -; _In_ PCSYMCRYPT_MODULUS pmMod, -; _In_ PUINT32 pSrc, -; _Out_ PUINT32 pDst ) - - NESTED_ENTRY SymCryptFdefMontgomeryReduce512Asm, _TEXT - - MULT_COMMON_PROLOGUE - - mov r11, rdx ; r11 = pSrc - mov ebp, [rcx + SymCryptModulusNdigitsOffsetAmd64] ; nDigits - mov r13, [rcx + SymCryptModulusMontgomeryInv64OffsetAmd64] ; inv64 - - lea rcx, [rcx + SymCryptModulusValueOffsetAmd64] ; modulus value - - mov edi, ebp ; outer loop counter - shl edi, 3 ; edi is in words - - xor r14d, r14d - - ; General register allocations - ; rax = multiply result - ; rbx = multiplier in inner loop - ; rcx = pointer to modulus value - ; rdx = multiply result - ; rsi = loop counter - ; rdi = loop counter - ; rbp = nDigits - ; r8 = pDst - ; r9 = running pointer in Src - ; r10 = running pointer in Mod - ; r11 = pSrc (updated in outer loop) - ; r12 = carry - ; r13 = pmMod->tm.montgomery.inv64 - ; r14 = carry out from last word of previous loop iteration - - ALIGN 16 - -SymCryptFdefMontgomeryReduce512AsmOuterLoop: - - ; start decoder with a few simple instructions, including at least one that requires - ; a uop execution and is on the critical path - - mov rbx, [r11] ; fetch word of Src we want to set to zero - mov r10, r11 - mov r9, rcx - - imul rbx, r13 ; lower word is same for signed & unsigned multiply - - mov esi, ebp - xor r12d, r12d - - ; rax = mul scratch - ; rbx = multiplier - ; rcx = pointer to modulus value - ; rdx = mul scratch - ; edi = outer loop counter (words) - ; esi = inner loop counter (digits) - ; r9 = running ptr to modulus - ; r10 = running ptr to input/scratch - ; r12 = carry for even words (64 bits) - ; r15 = carry for odd words (64 bits) - - MULT_DOUBLEADD_128 0, r9, r10 - MULT_DOUBLEADD_128 2, r9, r10 - MULT_DOUBLEADD_128 4, r9, r10 - MULT_DOUBLEADD_128 6, r9, r10 - - lea r10,[r10 + 64] - - add r12, r14 - mov r14d, 0 - adc r14, 0 - add r12, [r10] - adc r14, 0 - mov [r10], r12 - - lea r11,[r11 + 8] - - dec edi - jnz SymCryptFdefMontgomeryReduce512AsmOuterLoop - - ; - ; Most of the work is done; now all that is left is subtract the modulus if it is smaller than the result - ; - - ; First we compute the pSrc result minus the modulus into the destination - mov esi, ebp ; loop ctr - mov 
r10, r11 ; pSrc - mov r9, rcx ; pMod - mov r12, r8 ; pDst - - ; Cy = 0 because the last 'sub edi,1' resulted in 0 - mov rax,[r10] - sbb rax,[r9] - mov [r12], rax - - mov rax,[r10 + 8] - sbb rax,[r9 + 8] - mov [r12 + 8], rax - - mov rax,[r10 + 16] - sbb rax,[r9 + 16] - mov [r12 + 16], rax - - mov rax,[r10 + 24] - sbb rax,[r9 + 24] - mov [r12 + 24], rax - - mov rax,[r10 + 32] - sbb rax,[r9 + 32] - mov [r12 + 32], rax - - mov rax,[r10 + 40] - sbb rax,[r9 + 40] - mov [r12 + 40], rax - - mov rax,[r10 + 48] - sbb rax,[r9 + 48] - mov [r12 + 48], rax - - mov rax,[r10 + 56] - sbb rax,[r9 + 56] - mov [r12 + 56], rax - - lea r10,[r10 + 64] - lea r9,[r9 + 64] - lea r12,[r12 + 64] - - ; Finally a masked copy form pSrc to pDst - ; copy if: r14 == 0 && Cy = 1 - sbb r14d, 0 - - movd xmm0, r14d ; xmm0[0] = mask - pcmpeqd xmm1, xmm1 ; xmm1 = ff...ff - pshufd xmm0, xmm0, 0 ; xmm0[0..3] = mask - pxor xmm1, xmm0 ; xmm1 = not Mask - - ALIGN 16 - -SymCryptFdefMontgomeryReduce512AsmMaskedCopyLoop: - movdqa xmm2, [r11] ; xmm2 = pSrc[0] - movdqa xmm3, [r8] ; xmm3 = pDst[0] - pand xmm2, xmm0 ; - pand xmm3, xmm1 ; - por xmm2, xmm3 - movdqa [r8], xmm2 - - movdqa xmm2, [r11 + 16] ; xmm2 = pSrc[0] - movdqa xmm3, [r8 + 16] ; xmm3 = pDst[0] - pand xmm2, xmm0 ; - pand xmm3, xmm1 ; - por xmm2, xmm3 - movdqa [r8 + 16], xmm2 - - movdqa xmm2, [r11 + 32] ; xmm2 = pSrc[0] - movdqa xmm3, [r8 + 32] ; xmm3 = pDst[0] - pand xmm2, xmm0 ; - pand xmm3, xmm1 ; - por xmm2, xmm3 - movdqa [r8 + 32], xmm2 - - movdqa xmm2, [r11 + 48] ; xmm2 = pSrc[0] - movdqa xmm3, [r8 + 48] ; xmm3 = pDst[0] - pand xmm2, xmm0 ; - pand xmm3, xmm1 ; - por xmm2, xmm3 - movdqa [r8 + 48], xmm2 - - ; Move on to the next digit - lea r11,[r11 + 64] - lea r8,[r8 + 64] - - dec ebp - jnz SymCryptFdefMontgomeryReduce512AsmMaskedCopyLoop - - MULT_COMMON_EPILOGUE - - NESTED_END SymCryptFdefMontgomeryReduce512Asm, _TEXT - - -; -------------------------------- -; 1024-bit size specific functions -; -------------------------------- - -;VOID -;SYMCRYPT_CALL -;SymCryptFdefRawMul1024Asm( -; _In_reads_(nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc1, -; _In_reads_(nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc2, -; UINT32 nDigits, -; _Out_writes_(2*nWords) PUINT32 pDst ); - NESTED_ENTRY SymCryptFdefRawMul1024Asm, _TEXT - - MULT_COMMON_PROLOGUE ; saves all registers - - ; Basic structure: - ; for each word in Src1: - ; Dst += Src2 * word - ; Register assignments - ; - ; rax = tmp for mul - ; rbx = word from Src1 to multiply with - ; rcx = pSrc1 (updated in outer loop) - ; rdx = tmp for mul - ; rsi = pSrc2 (constant) - ; rdi = pDst (incremented in outer loop) - ; r8 = nDigits (constant) - ; r9 = pDst (constant) - ; r11 = # words left from Src1 to process - ; r12 = carry for even words (64 bits) - ; r15 = carry for odd words (64 bits) - - mov r11, r8 ; nDigits - shl r11, 3 ; nDigits * 8 = # words in Src1 to process - - mov rsi, rdx ; rsi = pSrc2 - mov rdi, r9 ; rdi = pDst - mov rbx, [rcx] ; mulword - - xor r12, r12 ; carry - - ; First inner loop overwrites Dst, which avoids adding the current Dst value - MULT_SINGLEADD_128 0, rsi, rdi - MULT_SINGLEADD_128 2, rsi, rdi - MULT_SINGLEADD_128 4, rsi, rdi - MULT_SINGLEADD_128 6, rsi, rdi - - MULT_SINGLEADD_128 8, rsi, rdi - MULT_SINGLEADD_128 10, rsi, rdi - MULT_SINGLEADD_128 12, rsi, rdi - MULT_SINGLEADD_128 14, rsi, rdi - - mov [rdi + 128], r12 ; write last word, cannot overflow because Dst is at least 2 digits long - - sub r11, 1 - - ALIGN 16 - -SymCryptFdefRawMul1024AsmLoopOuter: - - lea rcx, [rcx + 8] ; move to next word of 
pSrc1 - lea rdi, [rdi + 8] ; move Dst pointer one word over - - mov rbx, [rcx] ; mulword - - xor r12, r12 ; carry - - MULT_DOUBLEADD_128 0, rsi, rdi - MULT_DOUBLEADD_128 2, rsi, rdi - MULT_DOUBLEADD_128 4, rsi, rdi - MULT_DOUBLEADD_128 6, rsi, rdi - - MULT_DOUBLEADD_128 8, rsi, rdi - MULT_DOUBLEADD_128 10, rsi, rdi - MULT_DOUBLEADD_128 12, rsi, rdi - MULT_DOUBLEADD_128 14, rsi, rdi - - mov [rdi + 128], r12 ; write last word, cannot overflow because Dst is at least 2 digits long - - sub r11, 1 - jnz SymCryptFdefRawMul1024AsmLoopOuter - - MULT_COMMON_EPILOGUE - - NESTED_END SymCryptFdefRawMul1024Asm, _TEXT - -; VOID -; SYMCRYPT_CALL -; SymCryptFdefRawSquareAsm( -; _In_reads_(nDgigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc, -; UINT32 nDigits, -; _Out_writes_(2*nWords) PUINT32 pDst ) - NESTED_ENTRY SymCryptFdefRawSquare1024Asm, _TEXT - - MULT_COMMON_PROLOGUE - - ; Register assignments - ; - ; rax = tmp for mul - ; rbx = word from Src to multiply with - ; rcx = outer loop pointer into pSrc - ; rdx = tmp for mul - ; rsi = inner loop pointer into pSrc - ; rdi = inner loop pointer into pDst - ; r8 = pDst (constant) - ; r9 = nDigits (constant) - ; r10 = outer loop pointer into pDst - ; r11 = outer loop counter of #words left - ; r12 = carry for even words (64 bits) - ; r13 = inner loop counter of #words left - ; r14 = pSrc (constant) - ; r15 = carry for odd words (64 bits) - - mov r9, rdx ; nDigits - mov r14, rcx ; saving pSrc - - ; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - ; First Pass - Addition of the cross products x_i*x_j with i!=j - ; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - - mov r11, rdx ; nDigits - shl r11, 3 ; r11 = outer #words - mov r10, r8 ; r10 = outer pDst - - mov rsi, rcx ; rsi = inner pSrc - mov rdi, r10 ; rdi = inner pDst - - ; Initial inner loop overwrites Dst, which avoids adding the current Dst value - - ; 15 iterations - xor r15, r15 ; carry = 0 (for "odd" iterations set only the r15 carry) - mov rbx, [rcx] ; mulword - mov [rdi], r15 ; Write 0 in the first word - - SQR_SINGLEADD_64 1, rsi, rdi, r15, r12 - SQR_SINGLEADD_64 2, rsi, rdi, r12, r15 - SQR_SINGLEADD_64 3, rsi, rdi, r15, r12 - - SQR_SINGLEADD_64 4, rsi, rdi, r12, r15 - SQR_SINGLEADD_64 5, rsi, rdi, r15, r12 - SQR_SINGLEADD_64 6, rsi, rdi, r12, r15 - SQR_SINGLEADD_64 7, rsi, rdi, r15, r12 - - SQR_SINGLEADD_64 8, rsi, rdi, r12, r15 - SQR_SINGLEADD_64 9, rsi, rdi, r15, r12 - SQR_SINGLEADD_64 10, rsi, rdi, r12, r15 - SQR_SINGLEADD_64 11, rsi, rdi, r15, r12 - - SQR_SINGLEADD_64 12, rsi, rdi, r12, r15 - SQR_SINGLEADD_64 13, rsi, rdi, r15, r12 - SQR_SINGLEADD_64 14, rsi, rdi, r12, r15 - SQR_SINGLEADD_64 15, rsi, rdi, r15, r12 - - mov [rdi + 16*8], r12 ; write last word, cannot overflow because Dst is at least 2 digits long - add r10, 8 ; Skip over the first word - - ; 14 iterations (adding the current Dst value) - xor r12, r12 ; carry = 0 (for "even" iterations set only the r12 carry) - SQR_SIZE_SPECIFIC_INIT - SQR_DOUBLEADD_64_2 0 - SQR_DOUBLEADD_64_4 2 - SQR_DOUBLEADD_64_8 6 - mov [rdi + 14*8], r12 - - ; 13 iterations - xor r15, r15 - SQR_SIZE_SPECIFIC_INIT - SQR_DOUBLEADD_64 0, rsi, rdi, r15, r12 ; Notice the dst_carry is r12 since all the "double" macros have r12 as src_carry - SQR_DOUBLEADD_64_4 1 - SQR_DOUBLEADD_64_8 5 - mov [rdi + 13*8], r12 - - ; 12 iterations - xor r12, r12 - SQR_SIZE_SPECIFIC_INIT - SQR_DOUBLEADD_64_4 0 - SQR_DOUBLEADD_64_8 4 - mov [rdi + 12*8], r12 - - ; 11 iterations - xor r15, r15 - SQR_SIZE_SPECIFIC_INIT - SQR_DOUBLEADD_64 0, rsi, rdi, r15, r12 - 
SQR_DOUBLEADD_64_2 1 - SQR_DOUBLEADD_64_8 3 - mov [rdi + 11*8], r12 - - ; 10 iterations - xor r12, r12 - SQR_SIZE_SPECIFIC_INIT - SQR_DOUBLEADD_64_2 0 - SQR_DOUBLEADD_64_8 2 - mov [rdi + 10*8], r12 - - - ; 9 iterations - xor r15, r15 - SQR_SIZE_SPECIFIC_INIT - SQR_DOUBLEADD_64 0, rsi, rdi, r15, r12 - SQR_DOUBLEADD_64_8 1 - mov [rdi + 9*8], r12 - - ; 8 iterations - xor r12, r12 - SQR_SIZE_SPECIFIC_INIT - SQR_DOUBLEADD_64_8 0 - mov [rdi + 8*8], r12 - - ; 7 iterations - xor r15, r15 - SQR_SIZE_SPECIFIC_INIT - SQR_DOUBLEADD_64 0, rsi, rdi, r15, r12 - SQR_DOUBLEADD_64_2 1 - SQR_DOUBLEADD_64_4 3 - mov [rdi + 7*8], r12 - - ; 6 iterations - xor r12, r12 - SQR_SIZE_SPECIFIC_INIT - SQR_DOUBLEADD_64_2 0 - SQR_DOUBLEADD_64_4 2 - mov [rdi + 6*8], r12 - - ; 5 iterations - xor r15, r15 - SQR_SIZE_SPECIFIC_INIT - SQR_DOUBLEADD_64 0, rsi, rdi, r15, r12 - SQR_DOUBLEADD_64_4 1 - mov [rdi + 5*8], r12 - - ; 4 iterations - xor r12, r12 - SQR_SIZE_SPECIFIC_INIT - SQR_DOUBLEADD_64_4 0 - mov [rdi + 4*8], r12 - - ; 3 iterations - xor r15, r15 - SQR_SIZE_SPECIFIC_INIT - SQR_DOUBLEADD_64 0, rsi, rdi, r15, r12 - SQR_DOUBLEADD_64_2 1 - mov [rdi + 3*8], r12 - - ; 2 iterations - xor r12, r12 - SQR_SIZE_SPECIFIC_INIT - SQR_DOUBLEADD_64_2 0 - mov [rdi + 2*8], r12 - - ; 1 iterations - xor r15, r15 - SQR_SIZE_SPECIFIC_INIT - SQR_DOUBLEADD_64 0, rsi, rdi, r15, r12 - mov [rdi + 8], r12 - - xor rdx, rdx - mov [rdi + 16], rdx ; Final word = 0 - - - ; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - ; Second Pass - Shifting all results 1 bit left - ; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - - xor rax, rax ; carry flag = 0 - ; mov r11, r9 ; nDigits - mov rdi, r8 ; pDst pointer - ; shl r11, 1 ; 2*nDigits - - ; ALIGN 16 -; SymCryptFdefRawSquare1024AsmSecondPass: - SQR_SHIFT_LEFT 0 - SQR_SHIFT_LEFT 1 - SQR_SHIFT_LEFT 2 - SQR_SHIFT_LEFT 3 - - SQR_SHIFT_LEFT 4 - SQR_SHIFT_LEFT 5 - SQR_SHIFT_LEFT 6 - SQR_SHIFT_LEFT 7 - - SQR_SHIFT_LEFT 8 - SQR_SHIFT_LEFT 9 - SQR_SHIFT_LEFT 10 - SQR_SHIFT_LEFT 11 - - SQR_SHIFT_LEFT 12 - SQR_SHIFT_LEFT 13 - SQR_SHIFT_LEFT 14 - SQR_SHIFT_LEFT 15 - - SQR_SHIFT_LEFT 16 - SQR_SHIFT_LEFT 17 - SQR_SHIFT_LEFT 18 - SQR_SHIFT_LEFT 19 - - SQR_SHIFT_LEFT 20 - SQR_SHIFT_LEFT 21 - SQR_SHIFT_LEFT 22 - SQR_SHIFT_LEFT 23 - - SQR_SHIFT_LEFT 24 - SQR_SHIFT_LEFT 25 - SQR_SHIFT_LEFT 26 - SQR_SHIFT_LEFT 27 - - SQR_SHIFT_LEFT 28 - SQR_SHIFT_LEFT 29 - SQR_SHIFT_LEFT 30 - SQR_SHIFT_LEFT 31 - - ; lea rdi, [rdi + 64] - ; dec r11 - ; jnz SymCryptFdefRawSquare1024AsmSecondPass - - ; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - ; Third Pass - Adding the squares on the even columns and propagating the sum - ; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - - mov rsi, r14 ; rsi = pSrc - mov rdi, r8 ; rdi = pDst - - xor r12, r12 - - SQR_DIAGONAL_PROP 0 - SQR_DIAGONAL_PROP 1 - SQR_DIAGONAL_PROP 2 - SQR_DIAGONAL_PROP 3 - SQR_DIAGONAL_PROP 4 - SQR_DIAGONAL_PROP 5 - SQR_DIAGONAL_PROP 6 - SQR_DIAGONAL_PROP 7 - - SQR_DIAGONAL_PROP 8 - SQR_DIAGONAL_PROP 9 - SQR_DIAGONAL_PROP 10 - SQR_DIAGONAL_PROP 11 - SQR_DIAGONAL_PROP 12 - SQR_DIAGONAL_PROP 13 - SQR_DIAGONAL_PROP 14 - SQR_DIAGONAL_PROP 15 - - MULT_COMMON_EPILOGUE - - NESTED_END SymCryptFdefRawSquare1024Asm, _TEXT - -;VOID -;SymCryptFdefMontgomeryReduce1024Asm( -; _In_ PCSYMCRYPT_MODULUS pmMod, -; _In_ PUINT32 pSrc, -; _Out_ PUINT32 pDst ) - - NESTED_ENTRY SymCryptFdefMontgomeryReduce1024Asm, _TEXT - - MULT_COMMON_PROLOGUE - - mov r11, rdx ; r11 = pSrc - mov ebp, [rcx + 
SymCryptModulusNdigitsOffsetAmd64] ; nDigits - mov r13, [rcx + SymCryptModulusMontgomeryInv64OffsetAmd64] ; inv64 - - lea rcx, [rcx + SymCryptModulusValueOffsetAmd64] ; modulus value - - mov edi, ebp ; outer loop counter - shl edi, 3 ; edi is in words - - xor r14d, r14d - - ; General register allocations - ; rax = multiply result - ; rbx = multiplier in inner loop - ; rcx = pointer to modulus value - ; rdx = multiply result - ; rsi = loop counter - ; rdi = loop counter - ; rbp = nDigits - ; r8 = pDst - ; r9 = running pointer in Src - ; r10 = running pointer in Mod - ; r11 = pSrc (updated in outer loop) - ; r12 = carry - ; r13 = pmMod->tm.montgomery.inv64 - ; r14 = carry out from last word of previous loop iteration - - ALIGN 16 - -SymCryptFdefMontgomeryReduce1024AsmOuterLoop: - - ; start decoder with a few simple instructions, including at least one that requires - ; a uop execution and is on the critical path - - mov rbx, [r11] ; fetch word of Src we want to set to zero - mov r10, r11 - mov r9, rcx - - imul rbx, r13 ; lower word is same for signed & unsigned multiply - - mov esi, ebp - xor r12d, r12d - - ; rax = mul scratch - ; rbx = multiplier - ; rcx = pointer to modulus value - ; rdx = mul scratch - ; edi = outer loop counter (words) - ; esi = inner loop counter (digits) - ; r9 = running ptr to modulus - ; r10 = running ptr to input/scratch - ; r12 = carry for even words (64 bits) - ; r15 = carry for odd words (64 bits) - - MULT_DOUBLEADD_128 0, r9, r10 - MULT_DOUBLEADD_128 2, r9, r10 - MULT_DOUBLEADD_128 4, r9, r10 - MULT_DOUBLEADD_128 6, r9, r10 - - MULT_DOUBLEADD_128 8, r9, r10 - MULT_DOUBLEADD_128 10, r9, r10 - MULT_DOUBLEADD_128 12, r9, r10 - MULT_DOUBLEADD_128 14, r9, r10 - - lea r10,[r10 + 128] - - add r12, r14 - mov r14d, 0 - adc r14, 0 - add r12, [r10] - adc r14, 0 - mov [r10], r12 - - lea r11,[r11 + 8] - - dec edi - jnz SymCryptFdefMontgomeryReduce1024AsmOuterLoop - - ; - ; Most of the work is done; now all that is left is subtract the modulus if it is smaller than the result - ; - - ; First we compute the pSrc result minus the modulus into the destination - mov esi, ebp ; loop ctr - mov r10, r11 ; pSrc - mov r9, rcx ; pMod - mov r12, r8 ; pDst - - ; Cy = 0 because the last 'sub edi,1' resulted in 0 - - ALIGN 16 - -SymCryptFdefMontgomeryReduce1024AsmSubLoop: - mov rax,[r10] - sbb rax,[r9] - mov [r12], rax - - mov rax,[r10 + 8] - sbb rax,[r9 + 8] - mov [r12 + 8], rax - - mov rax,[r10 + 16] - sbb rax,[r9 + 16] - mov [r12 + 16], rax - - mov rax,[r10 + 24] - sbb rax,[r9 + 24] - mov [r12 + 24], rax - - mov rax,[r10 + 32] - sbb rax,[r9 + 32] - mov [r12 + 32], rax - - mov rax,[r10 + 40] - sbb rax,[r9 + 40] - mov [r12 + 40], rax - - mov rax,[r10 + 48] - sbb rax,[r9 + 48] - mov [r12 + 48], rax - - mov rax,[r10 + 56] - sbb rax,[r9 + 56] - mov [r12 + 56], rax - - lea r10,[r10 + 64] - lea r9,[r9 + 64] - lea r12,[r12 + 64] - - dec esi - jnz SymCryptFdefMontgomeryReduce1024AsmSubLoop - - ; Finally a masked copy form pSrc to pDst - ; copy if: r14 == 0 && Cy = 1 - sbb r14d, 0 - - movd xmm0, r14d ; xmm0[0] = mask - pcmpeqd xmm1, xmm1 ; xmm1 = ff...ff - pshufd xmm0, xmm0, 0 ; xmm0[0..3] = mask - pxor xmm1, xmm0 ; xmm1 = not Mask - - ALIGN 16 - -SymCryptFdefMontgomeryReduce1024AsmMaskedCopyLoop: - movdqa xmm2, [r11] ; xmm2 = pSrc[0] - movdqa xmm3, [r8] ; xmm3 = pDst[0] - pand xmm2, xmm0 ; - pand xmm3, xmm1 ; - por xmm2, xmm3 - movdqa [r8], xmm2 - - movdqa xmm2, [r11 + 16] ; xmm2 = pSrc[0] - movdqa xmm3, [r8 + 16] ; xmm3 = pDst[0] - pand xmm2, xmm0 ; - pand xmm3, xmm1 ; - por xmm2, xmm3 - movdqa 
[r8 + 16], xmm2 - - movdqa xmm2, [r11 + 32] ; xmm2 = pSrc[0] - movdqa xmm3, [r8 + 32] ; xmm3 = pDst[0] - pand xmm2, xmm0 ; - pand xmm3, xmm1 ; - por xmm2, xmm3 - movdqa [r8 + 32], xmm2 - - movdqa xmm2, [r11 + 48] ; xmm2 = pSrc[0] - movdqa xmm3, [r8 + 48] ; xmm3 = pDst[0] - pand xmm2, xmm0 ; - pand xmm3, xmm1 ; - por xmm2, xmm3 - movdqa [r8 + 48], xmm2 - - ; Move on to the next digit - lea r11,[r11 + 64] - lea r8,[r8 + 64] - - dec ebp - jnz SymCryptFdefMontgomeryReduce1024AsmMaskedCopyLoop - - MULT_COMMON_EPILOGUE - - NESTED_END SymCryptFdefMontgomeryReduce1024Asm, _TEXT - - end diff --git a/lib/amd64/fdef_asm.symcryptasm b/lib/amd64/fdef_asm.symcryptasm new file mode 100644 index 0000000..cfe1f53 --- /dev/null +++ b/lib/amd64/fdef_asm.symcryptasm @@ -0,0 +1,2135 @@ +// +// fdef_asm.symcryptasm Assembler code for large integer arithmetic in the default data format +// Expresses asm in a generic enough way to enable generation of MASM and GAS using the +// symcryptasm_processor.py script and C preprocessor +// +// Copyright (c) Microsoft Corporation. Licensed under the MIT license. +// + +#include "symcryptasm_shared.cppasm" + + +MACRO_START(MULT_SINGLEADD_128, index, src_reg, dst_reg, Q0, QH, mul_word, even_carry, odd_carry) + // Q0 = mul scratch + // QH = mul scratch + // mul_word = multiplier + // src_reg = running ptr to input + // dst_reg = running ptr to output/scratch + // even_carry = carry for even words (64 bits) + // odd_carry = carry for odd words (64 bits) + + mov Q0, [src_reg + 8*index] + mul mul_word + mov odd_carry, QH + add Q0, even_carry + mov [dst_reg + 8*index], Q0 + adc odd_carry, 0 + + mov Q0, [src_reg + 8*(index+1)] + mul mul_word + mov even_carry, QH + add Q0, odd_carry + mov [dst_reg + 8*(index+1)], Q0 + adc even_carry, 0 +MACRO_END() + +MACRO_START(MULT_DOUBLEADD_128, index, src_reg, dst_reg, Q0, QH, mul_word, even_carry, odd_carry) + // Q0 = mul scratch + // QH = mul scratch + // mul_word = multiplier + // src_reg = running ptr to input + // dst_reg = running ptr to output/scratch + // even_carry = carry for even words (64 bits) + // odd_carry = carry for odd words (64 bits) + + mov Q0, [src_reg + 8*index] + mul mul_word + mov odd_carry, QH + add Q0, [dst_reg + 8*index] + adc odd_carry, 0 + add Q0, even_carry + mov [dst_reg + 8*index], Q0 + adc odd_carry, 0 + + mov Q0, [src_reg + 8*(index+1)] + mul mul_word + mov even_carry, QH + add Q0, [dst_reg + 8*(index+1)] + adc even_carry, 0 + add Q0, odd_carry + mov [dst_reg + 8*(index+1)], Q0 + adc even_carry, 0 +MACRO_END() + +// Squaring + +MACRO_START(SQR_SINGLEADD_64, index, src_reg, dst_reg, Q0, QH, mul_word, src_carry, dst_carry) + // Q0 = mul scratch + // QH = mul scratch + // mul_word = multiplier + // src_reg = running ptr to input + // dst_reg = running ptr to output/scratch + // src_carry = input carry + // dst_carry = output carry + + mov Q0, [src_reg + 8*index] + mul mul_word + mov dst_carry, QH + add Q0, src_carry + mov [dst_reg + 8*index], Q0 + adc dst_carry, 0 +MACRO_END() + +MACRO_START(SQR_DOUBLEADD_64, index, src_reg, dst_reg, Q0, QH, mul_word, src_carry, dst_carry) + // Q0 = mul scratch + // QH = mul scratch + // mul_word = multiplier + // src_reg = running ptr to input + // dst_reg = running ptr to output/scratch + // src_carry = input carry + // dst_carry = output carry + + mov Q0, [src_reg + 8*index] + mul mul_word + mov dst_carry, QH + add Q0, [dst_reg + 8*index] + adc dst_carry, 0 + add Q0, src_carry + mov [dst_reg + 8*index], Q0 + adc dst_carry, 0 +MACRO_END() + +MACRO_START(SQR_SHIFT_LEFT, index, 
Q0, src_reg) + mov Q0, [src_reg + 8*index] + adc Q0, Q0 // Shift left and add the carry + mov [src_reg + 8*index], Q0 +MACRO_END() + +MACRO_START(SQR_DIAGONAL_PROP, index, src_reg, dst_reg, Q0, QH, carry) + // Calculating the square + mov Q0, [src_reg + 8*index] // mulword + mul Q0 // m^2 + + // Adding the square to the even column + add Q0, [dst_reg + 16*index] + adc QH, 0 + add Q0, carry + adc QH, 0 + mov [dst_reg + 16*index], Q0 + + // Propagating the sum to the next column + mov Q0, QH + xor QH, QH + + add Q0, [dst_reg + 16*index + 8] + adc QH, 0 + mov [dst_reg + 16*index + 8], Q0 + mov carry, QH +MACRO_END() + +MACRO_START(MONTGOMERY14, Q0, QH, mul_word, pA, R0, R1, R2, R3, Cy) + // (xx, R1, R2, R3, QH) = mul_word * (A0..3) + (R0, R1, R2, R3) + // Used when it is statically known that R0 will get set to 0, so we don't bother computing it + // Cy, Q0 = scratch + + mov Q0, [pA] + mul mul_word + add R0, -1 // set carry flag only when R0 is non-zero + adc QH, 0 + mov Cy, QH + + mov Q0, [pA + 8] + mul mul_word + add R1, Q0 + adc QH, 0 + add R1, Cy + adc QH, 0 + mov Cy, QH + + mov Q0, [pA + 16] + mul mul_word + add R2, Q0 + adc QH, 0 + add R2, Cy + adc QH, 0 + mov Cy, QH + + mov Q0, [pA + 24] + mul mul_word + add R3, Q0 + adc QH, 0 + add R3, Cy + adc QH, 0 +MACRO_END() + +MACRO_START(MUL14, Q0, QH, mul_word, pA, R0, R1, R2, R3, Cy) + // (R0, R1, R2, R3, QH) = mul_word * (A0..3) + (R0, R1, R2, R3) + // Cy, Q0 = scratch + + mov Q0, [pA] + mul mul_word + add R0, Q0 + adc QH, 0 + mov Cy, QH + + mov Q0, [pA + 8] + mul mul_word + add R1, Q0 + adc QH, 0 + add R1, Cy + adc QH, 0 + mov Cy, QH + + mov Q0, [pA + 16] + mul mul_word + add R2, Q0 + adc QH, 0 + add R2, Cy + adc QH, 0 + mov Cy, QH + + mov Q0, [pA + 24] + mul mul_word + add R3, Q0 + adc QH, 0 + add R3, Cy + adc QH, 0 +MACRO_END() + +// Macros for size-specific squaring +MACRO_START(SQR_DOUBLEADD_64_2, index, src_reg, dst_reg, Q0, QH, mul_word, src_carry, dst_carry) + SQR_DOUBLEADD_64 (index), src_reg, dst_reg, Q0, QH, mul_word, src_carry, dst_carry + SQR_DOUBLEADD_64 (index + 1), src_reg, dst_reg, Q0, QH, mul_word, dst_carry, src_carry +MACRO_END() + +MACRO_START(SQR_DOUBLEADD_64_4, index, src_reg, dst_reg, Q0, QH, mul_word, src_carry, dst_carry) + SQR_DOUBLEADD_64_2 (index), src_reg, dst_reg, Q0, QH, mul_word, src_carry, dst_carry + SQR_DOUBLEADD_64_2 (index + 2), src_reg, dst_reg, Q0, QH, mul_word, src_carry, dst_carry +MACRO_END() + +MACRO_START(SQR_DOUBLEADD_64_8, index, src_reg, dst_reg, Q0, QH, mul_word, src_carry, dst_carry) + SQR_DOUBLEADD_64_4 (index), src_reg, dst_reg, Q0, QH, mul_word, src_carry, dst_carry + SQR_DOUBLEADD_64_4 (index + 4), src_reg, dst_reg, Q0, QH, mul_word, src_carry, dst_carry +MACRO_END() + +MACRO_START(SQR_SIZE_SPECIFIC_INIT, outer_src_reg, outer_dst_reg, inner_src_reg, inner_dst_reg, mul_word) + lea outer_src_reg, [outer_src_reg + 8] // move outer_src_reg pointer 1 word over + lea outer_dst_reg, [outer_dst_reg + 16] // move outer_dst_reg pointer 2 words over + + mov inner_src_reg, outer_src_reg // inner_src_reg = outer_src_reg + mov inner_dst_reg, outer_dst_reg // inner_dst_reg = outer_dst_reg + + mov mul_word, [outer_src_reg] // Get the next mulword + lea inner_src_reg, [inner_src_reg + 8] // move inner_src_reg pointer 1 word over +MACRO_END() + +//UINT32 +//SYMCRYPT_CALL +//SymCryptFdefRawAdd( +// _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc1, +// _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc2, +// _Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 
pDst, +// UINT32 nDigits ) + +FUNCTION_START(SymCryptFdefRawAddAsm, 4, 5) + + // loop over each half digit + add D4, D4 + xor Q0, Q0 + +SymCryptFdefRawAddAsmLoop: + // carry is in the carry flag + mov Q0,[Q1] + adc Q0,[Q2] + mov [Q3],Q0 + + mov Q0,[Q1 + 8] + adc Q0,[Q2 + 8] + mov [Q3 + 8], Q0 + + mov Q0,[Q1 + 16] + adc Q0,[Q2 + 16] + mov [Q3 + 16], Q0 + + mov Q0,[Q1 + 24] + adc Q0,[Q2 + 24] + mov [Q3 + 24], Q0 + + lea Q1, [Q1 + 32] + lea Q2, [Q2 + 32] + lea Q3, [Q3 + 32] + dec D4 + jnz SymCryptFdefRawAddAsmLoop + + mov Q0, 0 + adc Q0, Q0 + +FUNCTION_END(SymCryptFdefRawAddAsm) + +//UINT32 +//SYMCRYPT_CALL +//SymCryptFdefRawSub( +// _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 Src1, +// _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 Src2, +// _Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 Dst, +// UINT32 nDigits ) + +FUNCTION_START(SymCryptFdefRawSubAsm, 4, 5) + + // loop over each half digit + add D4, D4 + xor Q0, Q0 + +SymCryptFdefRawSubAsmLoop: + // carry is in the carry flag + mov Q0,[Q1] + sbb Q0,[Q2] + mov [Q3],Q0 + + mov Q0,[Q1 + 8] + sbb Q0,[Q2 + 8] + mov [Q3 + 8], Q0 + + mov Q0,[Q1 + 16] + sbb Q0,[Q2 + 16] + mov [Q3 + 16], Q0 + + mov Q0,[Q1 + 24] + sbb Q0,[Q2 + 24] + mov [Q3 + 24], Q0 + + lea Q1,[Q1 + 32] + lea Q2,[Q2 + 32] + lea Q3,[Q3 + 32] + dec D4 + jnz SymCryptFdefRawSubAsmLoop + + mov Q0, 0 + adc Q0, Q0 + +FUNCTION_END(SymCryptFdefRawSubAsm) + +//VOID +//SYMCRYPT_CALL +//SymCryptFdefMaskedCopy( +// _In_reads_bytes_( nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCBYTE pbSrc, +// _InOut_writes_bytes_( nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PBYTE pbDst, +// UINT32 nDigits, +// UINT32 mask ) + +FUNCTION_START(SymCryptFdefMaskedCopyAsm, 4, 4) + + add D3, D3 // loop over half digits + + movd xmm0, D4 // xmm0[0] = mask + pcmpeqd xmm1, xmm1 // xmm1 = ff...ff + pshufd xmm0, xmm0, 0 // xmm0[0..3] = mask + pxor xmm1, xmm0 // xmm1 = not Mask + +SymCryptFdefMaskedCopyAsmLoop: + movdqa xmm2, [Q1] // xmm2 = pSrc[i] + movdqa xmm3, [Q2] // xmm3 = pDst[i] + pand xmm2, xmm0 + pand xmm3, xmm1 + por xmm2, xmm3 + movdqa [Q2], xmm2 + + movdqa xmm2, [Q1 + 16] // xmm2 = pSrc[i + 16] + movdqa xmm3, [Q2 + 16] // xmm3 = pDst[i + 16] + pand xmm2, xmm0 + pand xmm3, xmm1 + por xmm2, xmm3 + movdqa [Q2 + 16], xmm2 + + // Move on to the next digit + + add Q1, 32 + add Q2, 32 + dec D3 + jnz SymCryptFdefMaskedCopyAsmLoop + +FUNCTION_END(SymCryptFdefMaskedCopyAsm) + + +//VOID +//SYMCRYPT_CALL +//SymCryptFdefRawMul( +// _In_reads_(nDigits1 * SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc1, +// UINT32 nDigits1, +// _In_reads_(nDigits2 * SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc2, +// UINT32 nDigits2, +// _Out_writes_((nDigits1+nDigits2)*SYMCRYPT_FDEF_DIGIT_NUINT32) PUINT32 pDst ) + +MUL_FUNCTION_START(SymCryptFdefRawMulAsm, 5, 12) + + shl Q2, 3 // nDigits1 * 8 = # words in Src1 to process + + // Basic structure: + // for each word in Src1: + // Dst += Src2 * word + // Register assignments + // + // Q0 = tmp for mul + // QH = tmp for mul + // Q1 = pSrc1 (updated in outer loop) + // Q2 = # words left from Src1 to process + // Q3 = pSrc2 + // Q4 = nDigits2 + // Q5 = pDst (incremented in outer loop) + // Q6 = inner loop pointer into pSrc2 + // Q7 = inner loop pointer into pDst + // Q8 = word from Src1 to multiply with + // Q9 = carry for even words (64 bits) + // Q10 = inner loop counter + // Q11 = carry for odd words (64 bits) + + + // Outer loop invariant established: Q1, Q3, Q4, Q5 + + mov Q6, Q3 // Q6 = pSrc2 + mov Q7, Q5 // Q7 = pDst + outer loop ctr + mov Q8, [Q1] // mulword 
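As a reference point for the loop structure spelled out in the register-assignment comments above, here is a hypothetical C rendering of the same "for each word in Src1: Dst += Src2 * word" schoolbook multiplication. It is a sketch only (names are illustrative, and it assumes GCC/Clang's `unsigned __int128`), not SymCrypt's C code:

```c
#include <stdint.h>

static void raw_mul_sketch(
    const uint64_t *src1, int nWords1,
    const uint64_t *src2, int nWords2,
    uint64_t *dst )                         /* nWords1 + nWords2 words */
{
    for( int i = 0; i < nWords1; i++ )
    {
        unsigned __int128 c = 0;            /* inter-word carry */
        for( int j = 0; j < nWords2; j++ )
        {
            unsigned __int128 t = (unsigned __int128) src1[i] * src2[j] + c;
            if( i != 0 )                    /* MULT_DOUBLEADD rows add   */
                t += dst[i + j];            /* the current Dst word;     */
            dst[i + j] = (uint64_t) t;      /* the first row (i == 0) is */
            c = t >> 64;                    /* the overwriting SINGLEADD */
        }
        dst[i + nWords2] = (uint64_t) c;    /* carry word, stays in Dst */
    }
}
```

The `i != 0` case mirrors the split into MULT_SINGLEADD_128 for the first pass and MULT_DOUBLEADD_128 for the rest, which is why Dst never has to be zeroed first.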
+ xor Q9, Q9 + mov Q10, Q4 + + // First inner loop overwrites Dst, which avoids adding the current Dst value + +ALIGN(16) + +SymCryptFdefRawMulAsmLoop1: + MULT_SINGLEADD_128 0, Q6, Q7, Q0, QH, Q8, Q9, Q11 + MULT_SINGLEADD_128 2, Q6, Q7, Q0, QH, Q8, Q9, Q11 + MULT_SINGLEADD_128 4, Q6, Q7, Q0, QH, Q8, Q9, Q11 + MULT_SINGLEADD_128 6, Q6, Q7, Q0, QH, Q8, Q9, Q11 + + lea Q6,[Q6 + 64] + lea Q7,[Q7 + 64] + + dec Q10 + jnz SymCryptFdefRawMulAsmLoop1 + + mov [Q7], Q9 // write last word, cannot overflow because Dst is at least 2 digits long + + dec Q2 + +ALIGN(16) + +SymCryptFdefRawMulAsmLoopOuter: + + add Q1, 8 // move to next word of pSrc1 + add Q5, 8 // move Dst pointer one word over + mov Q8, [Q1] + mov Q6, Q3 + mov Q7, Q5 + xor Q9, Q9 + mov Q10, Q4 + +ALIGN(16) + +SymCryptFdefRawMulAsmLoop2: + MULT_DOUBLEADD_128 0, Q6, Q7, Q0, QH, Q8, Q9, Q11 + MULT_DOUBLEADD_128 2, Q6, Q7, Q0, QH, Q8, Q9, Q11 + MULT_DOUBLEADD_128 4, Q6, Q7, Q0, QH, Q8, Q9, Q11 + MULT_DOUBLEADD_128 6, Q6, Q7, Q0, QH, Q8, Q9, Q11 + + lea Q6,[Q6 + 64] + lea Q7,[Q7 + 64] + + dec Q10 + jnz SymCryptFdefRawMulAsmLoop2 + + mov [Q7], Q9 // write next word. (stays within Dst buffer) + + dec Q2 + jnz SymCryptFdefRawMulAsmLoopOuter + +MUL_FUNCTION_END(SymCryptFdefRawMulAsm) + +// VOID +// SYMCRYPT_CALL +// SymCryptFdefRawSquareAsm( +// _In_reads_(nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc, +// UINT32 nDigits, +// _Out_writes_(2*nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PUINT32 pDst ) + +MUL_FUNCTION_START(SymCryptFdefRawSquareAsm, 3, 13) + + // Register assignments + // + // Q0 = tmp for mul + // QH = tmp for mul + // Q1 = outer loop pointer into pSrc + // Q2 = nDigits (constant) + // Q3 = pDst (constant) + // Q4 = inner loop pointer into pSrc + // Q5 = inner loop pointer into pDst + // Q6 = word from Src to multiply with + // Q7 = carry for even words (64 bits) + // Q8 = carry for odd words (64 bits) + // Q9 = outer loop pointer into pDst + // Q10 = outer loop counter of #words left + // Q11 = inner loop counter of #words left + // Q12 = cyclic counter that specifies on which branch we jump into + + //////////////////////////////////////////////////////////////// + // First Pass - Addition of the cross products x_i*x_j with i!=j + //////////////////////////////////////////////////////////////// + // + // At the beginning of each inner loop we will jump over the + // words that don't need processing. The decision of the jump + // will be based on the cyclic counter Q12. + // + // For the first pass we loop over **half** digits since having a smaller + // number of jumps (i.e. 4) is actually faster than having 8 jumps. 
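For orientation, the three passes described in these comments amount to the following. This is a hypothetical C sketch (illustrative only, assuming GCC/Clang's `unsigned __int128`), exploiting x_i*x_j == x_j*x_i: accumulate each cross product once, double everything with a one-bit left shift, then add the diagonal squares on the even columns:

```c
#include <stdint.h>

static void raw_square_sketch( const uint64_t *src, int n, uint64_t *dst )
{
    for( int k = 0; k < 2 * n; k++ ) dst[k] = 0;

    /* Pass 1: accumulate each cross product src[i]*src[j], i < j, once */
    for( int i = 0; i < n; i++ )
    {
        unsigned __int128 c = 0;
        for( int j = i + 1; j < n; j++ )
        {
            unsigned __int128 t = (unsigned __int128) src[i] * src[j]
                                  + dst[i + j] + c;
            dst[i + j] = (uint64_t) t;
            c = t >> 64;
        }
        dst[i + n] = (uint64_t) c;      /* next word is still untouched */
    }

    /* Pass 2: double everything by shifting the result left one bit */
    uint64_t msb = 0;
    for( int k = 0; k < 2 * n; k++ )
    {
        uint64_t w = dst[k];
        dst[k] = (w << 1) | msb;        /* SQR_SHIFT_LEFT: adc w, w */
        msb = w >> 63;
    }

    /* Pass 3: add src[i]^2 on the even columns and propagate the sum */
    unsigned __int128 c = 0;
    for( int i = 0; i < n; i++ )
    {
        unsigned __int128 t = (unsigned __int128) src[i] * src[i]
                              + dst[2 * i] + c;
        dst[2 * i] = (uint64_t) t;
        t = (t >> 64) + dst[2 * i + 1];
        dst[2 * i + 1] = (uint64_t) t;
        c = t >> 64;                    /* SQR_DIAGONAL_PROP carry */
    }
}
```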
+ // + //////////////////////////////////////////////////////////////// + + mov [rsp + GET_MEMSLOT_OFFSET(slot0)], Q1 // save pSrc + + mov Q10, Q2 // nDigits + shl Q10, 3 // Q10 = outer #words + mov Q9, Q3 // Q9 = outer pDst + + mov Q4, Q1 // Q4 = inner pSrc + mov Q5, Q3 // Q5 = inner pDst + + // Initial inner loop overwrites Dst, which avoids adding the current Dst value + + mov Q6, [Q1] // mulword + + xor Q7, Q7 // carry = 0 + xor Q8, Q8 // carry = 0 + + mov Q11, Q10 // Q11 = inner #words + mov [Q5], Q7 // Write 0 in the first word + + // Skip over the first word + jmp SymCryptFdefRawSquareAsmInnerLoopInit_Word1 + +ALIGN(16) +SymCryptFdefRawSquareAsmInnerLoopInit_Word0: + SQR_SINGLEADD_64 0, Q4, Q5, Q0, QH, Q6, Q7, Q8 + +ALIGN(16) +SymCryptFdefRawSquareAsmInnerLoopInit_Word1: + SQR_SINGLEADD_64 1, Q4, Q5, Q0, QH, Q6, Q8, Q7 + + SQR_SINGLEADD_64 2, Q4, Q5, Q0, QH, Q6, Q7, Q8 + + SQR_SINGLEADD_64 3, Q4, Q5, Q0, QH, Q6, Q8, Q7 + + lea Q4, [Q4 + 32] + lea Q5, [Q5 + 32] + sub Q11, 4 + jnz SymCryptFdefRawSquareAsmInnerLoopInit_Word0 + + mov [Q5], Q7 // write last word, cannot overflow because Dst is at least 2 digits long + + dec Q10 // Counter for the outer loop + mov Q12, 1 // Cyclic counter Q12 = 1 + +ALIGN(16) +SymCryptFdefRawSquareAsmLoopOuter: + + add Q9, 8 // move Dst pointer 1 word over + + mov Q4, Q1 // Q4 = inner pSrc + mov Q5, Q9 // Q5 = inner pDst + + mov Q6, [Q1 + 8*Q12] // Get the next mulword + + inc B12 // Increment the cyclic counter by 1 + + mov Q11, Q10 // # of words for the inner loop + add Q11, 2 + and Q11, -4 // Zero out the 2 lower bits + + xor Q7, Q7 // carry = 0 + xor Q8, Q8 // carry = 0 + + // Logic to find the correct jump + cmp B12, 3 + je SymCryptFdefRawSquareAsmInnerLoop_Word3 + cmp B12, 2 + je SymCryptFdefRawSquareAsmInnerLoop_Word2 + cmp B12, 1 + je SymCryptFdefRawSquareAsmInnerLoop_Word1 + + // The following instructions are only executed when B12 == 4 + xor B12, B12 // Set it to 0 for the next iteration + + add Q1, 32 // move pSrc 4 words over + add Q9, 32 // move destination 4 words over + + mov Q4, Q1 // Q4 = inner pSrc + mov Q5, Q9 // Q5 = inner pDst + +ALIGN(16) +SymCryptFdefRawSquareAsmInnerLoop_Word0: + SQR_DOUBLEADD_64 0, Q4, Q5, Q0, QH, Q6, Q7, Q8 + +ALIGN(16) +SymCryptFdefRawSquareAsmInnerLoop_Word1: + SQR_DOUBLEADD_64 1, Q4, Q5, Q0, QH, Q6, Q8, Q7 + +ALIGN(16) +SymCryptFdefRawSquareAsmInnerLoop_Word2: + SQR_DOUBLEADD_64 2, Q4, Q5, Q0, QH, Q6, Q7, Q8 + +ALIGN(16) +SymCryptFdefRawSquareAsmInnerLoop_Word3: + SQR_DOUBLEADD_64 3, Q4, Q5, Q0, QH, Q6, Q8, Q7 + + lea Q4, [Q4 + 32] + lea Q5, [Q5 + 32] + sub Q11, 4 + jnz SymCryptFdefRawSquareAsmInnerLoop_Word0 + + mov [Q5], Q7 // write next word. 
(stays within Dst buffer) + + dec Q10 + cmp Q10, 1 + jne SymCryptFdefRawSquareAsmLoopOuter + + xor QH, QH + mov [Q9 + 40], QH // Final word = 0 + + + //////////////////////////////////////////////////////////////// + // Second Pass - Shifting all results 1 bit left + //////////////////////////////////////////////////////////////// + + mov Q10, Q2 // nDigits + mov Q5, Q3 // pDst pointer + shl Q10, 1 // 2*nDigits + +ALIGN(16) +SymCryptFdefRawSquareAsmSecondPass: + SQR_SHIFT_LEFT 0, Q0, Q5 + SQR_SHIFT_LEFT 1, Q0, Q5 + SQR_SHIFT_LEFT 2, Q0, Q5 + SQR_SHIFT_LEFT 3, Q0, Q5 + + SQR_SHIFT_LEFT 4, Q0, Q5 + SQR_SHIFT_LEFT 5, Q0, Q5 + SQR_SHIFT_LEFT 6, Q0, Q5 + SQR_SHIFT_LEFT 7, Q0, Q5 + + lea Q5, [Q5 + 64] + dec Q10 + jnz SymCryptFdefRawSquareAsmSecondPass + + ////////////////////////////////////////////////////////////////////////////// + // Third Pass - Adding the squares on the even columns and propagating the sum + ////////////////////////////////////////////////////////////////////////////// + + mov Q1, [rsp + GET_MEMSLOT_OFFSET(slot0)] // Q1 = pSrc + +SymCryptFdefRawSquareAsmThirdPass: + SQR_DIAGONAL_PROP 0, Q1, Q3, Q0, QH, Q10 + SQR_DIAGONAL_PROP 1, Q1, Q3, Q0, QH, Q10 + SQR_DIAGONAL_PROP 2, Q1, Q3, Q0, QH, Q10 + SQR_DIAGONAL_PROP 3, Q1, Q3, Q0, QH, Q10 + SQR_DIAGONAL_PROP 4, Q1, Q3, Q0, QH, Q10 + SQR_DIAGONAL_PROP 5, Q1, Q3, Q0, QH, Q10 + SQR_DIAGONAL_PROP 6, Q1, Q3, Q0, QH, Q10 + SQR_DIAGONAL_PROP 7, Q1, Q3, Q0, QH, Q10 + + add Q1, 64 // One digit up + add Q3, 128 // Two digits up + dec Q2 + jnz SymCryptFdefRawSquareAsmThirdPass + +MUL_FUNCTION_END(SymCryptFdefRawSquareAsm) + +//VOID +//SymCryptFdefMontgomeryReduceAsm( +// _In_ PCSYMCRYPT_MODULUS pmMod, +// _In_ PUINT32 pSrc, +// _Out_ PUINT32 pDst ) + +MUL_FUNCTION_START(SymCryptFdefMontgomeryReduceAsm, 3, 14) + + mov D4, [Q1 + SymCryptModulusNdigitsOffsetAmd64] // nDigits + mov Q5, [Q1 + SymCryptModulusMontgomeryInv64OffsetAmd64] // inv64 + + lea Q1, [Q1 + SymCryptModulusValueOffsetAmd64] // modulus value + + mov D13, D4 // outer loop counter + shl D13, 3 // D13 is in words + + xor D9, D9 + + // General register allocations + // Q0 = multiply result + // QH = multiply result + // Q1 = pointer to modulus value + // Q2 = pSrc (updated in outer loop) + // Q3 = pDst + // D4 = nDigits + // Q5 = pmMod->tm.montgomery.inv64 + // Q6 = multiplier in inner loop + // Q7 = carry for even words (64 bits) + // Q8 = carry for odd words (64 bits) + // Q9 = carry out from last word of previous loop iteration + // Q10 = running pointer in Src + // Q11 = running pointer in Mod + // Q12 = loop counter + // Q13 = loop counter + +ALIGN(16) + +SymCryptFdefMontgomeryReduceAsmOuterLoop: + + // start decoder with a few simple instructions, including at least one that requires + // a uop execution and is on the critical path + + mov Q6, [Q2] // fetch word of Src we want to set to zero + mov Q11, Q2 + mov Q10, Q1 + + imul Q6, Q5 // lower word is same for signed & unsigned multiply + + mov D12, D4 + xor D7, D7 + +ALIGN(16) + +SymCryptFdefMontgomeryReduceAsmInnerloop: + // Q0 = mul scratch + // QH = mul scratch + // Q6 = multiplier + // Q1 = pointer to modulus value + // D13 = outer loop counter (words) + // D12 = inner loop counter (digits) + // Q10 = running ptr to modulus + // Q11 = running ptr to input/scratch + // Q7 = carry for even words (64 bits) + // Q8 = carry for odd words (64 bits) + + MULT_DOUBLEADD_128 0, Q10, Q11, Q0, QH, Q6, Q7, Q8 + MULT_DOUBLEADD_128 2, Q10, Q11, Q0, QH, Q6, Q7, Q8 + MULT_DOUBLEADD_128 4, Q10, Q11, Q0, QH, Q6, Q7, Q8 + 
MULT_DOUBLEADD_128 6, Q10, Q11, Q0, QH, Q6, Q7, Q8 + + lea Q10,[Q10 + 64] + lea Q11,[Q11 + 64] + + dec D12 + jnz SymCryptFdefMontgomeryReduceAsmInnerloop + + add Q7, Q9 + mov D9, 0 + adc Q9, 0 + add Q7, [Q11] + adc Q9, 0 + mov [Q11], Q7 + + lea Q2,[Q2 + 8] + + dec D13 + jnz SymCryptFdefMontgomeryReduceAsmOuterLoop + + // + // Most of the work is done - now all that is left is subtract the modulus if it is smaller than the result + // + + // First we compute the pSrc result minus the modulus into the destination + mov D12, D4 // loop ctr + mov Q11, Q2 // pSrc + mov Q10, Q1 // pMod + mov Q7, Q3 // pDst + + // Cy = 0 because the last 'adc Q9,0' resulted in 0, 1, or 2 + +ALIGN(16) + +SymCryptFdefMontgomeryReduceAsmSubLoop: + mov Q0,[Q11] + sbb Q0,[Q10] + mov [Q7], Q0 + + mov Q0,[Q11 + 8] + sbb Q0,[Q10 + 8] + mov [Q7 + 8], Q0 + + mov Q0,[Q11 + 16] + sbb Q0,[Q10 + 16] + mov [Q7 + 16], Q0 + + mov Q0,[Q11 + 24] + sbb Q0,[Q10 + 24] + mov [Q7 + 24], Q0 + + mov Q0,[Q11 + 32] + sbb Q0,[Q10 + 32] + mov [Q7 + 32], Q0 + + mov Q0,[Q11 + 40] + sbb Q0,[Q10 + 40] + mov [Q7 + 40], Q0 + + mov Q0,[Q11 + 48] + sbb Q0,[Q10 + 48] + mov [Q7 + 48], Q0 + + mov Q0,[Q11 + 56] + sbb Q0,[Q10 + 56] + mov [Q7 + 56], Q0 + + lea Q11,[Q11 + 64] + lea Q10,[Q10 + 64] + lea Q7,[Q7 + 64] + + dec D12 + jnz SymCryptFdefMontgomeryReduceAsmSubLoop + + // Finally a masked copy form pSrc to pDst + // copy if: Q9 == 0 && Cy = 1 + sbb D9, 0 + + movd xmm0, D9 // xmm0[0] = mask + pcmpeqd xmm1, xmm1 // xmm1 = ff...ff + pshufd xmm0, xmm0, 0 // xmm0[0..3] = mask + pxor xmm1, xmm0 // xmm1 = not Mask + +ALIGN(16) + +SymCryptFdefMontgomeryReduceAsmMaskedCopyLoop: + movdqa xmm2, [Q2] // xmm2 = pSrc[0] + movdqa xmm3, [Q3] // xmm3 = pDst[0] + pand xmm2, xmm0 // + pand xmm3, xmm1 // + por xmm2, xmm3 + movdqa [Q3], xmm2 + + movdqa xmm2, [Q2 + 16] // xmm2 = pSrc[0] + movdqa xmm3, [Q3 + 16] // xmm3 = pDst[0] + pand xmm2, xmm0 // + pand xmm3, xmm1 // + por xmm2, xmm3 + movdqa [Q3 + 16], xmm2 + + movdqa xmm2, [Q2 + 32] // xmm2 = pSrc[0] + movdqa xmm3, [Q3 + 32] // xmm3 = pDst[0] + pand xmm2, xmm0 // + pand xmm3, xmm1 // + por xmm2, xmm3 + movdqa [Q3 + 32], xmm2 + + movdqa xmm2, [Q2 + 48] // xmm2 = pSrc[0] + movdqa xmm3, [Q3 + 48] // xmm3 = pDst[0] + pand xmm2, xmm0 // + pand xmm3, xmm1 // + por xmm2, xmm3 + movdqa [Q3 + 48], xmm2 + + // Move on to the next digit + lea Q2,[Q2 + 64] + lea Q3,[Q3 + 64] + + dec D4 + jnz SymCryptFdefMontgomeryReduceAsmMaskedCopyLoop + +MUL_FUNCTION_END(SymCryptFdefMontgomeryReduceAsm) + + +// -------------------------------- +// 256-bit size specific functions +// -------------------------------- + +//VOID +//SYMCRYPT_CALL +//SymCryptFdefModAdd256( +// _In_ PCSYMCRYPT_MODULUS pmMod, +// _In_ PCSYMCRYPT_MODELEMENT peSrc1, +// _In_ PCSYMCRYPT_MODELEMENT peSrc2, +// _Out_ PSYMCRYPT_MODELEMENT peDst, +// _Out_writes_bytes_( cbScratch ) PBYTE pbScratch, +// SIZE_T cbScratch ) + +FUNCTION_START(SymCryptFdefModAdd256Asm, 4, 11) + + // Q1 = pmMod + // Q2 = peSrc1 + // Q3 = peSrc2 + // Q4 = peDst + + // compute Src1 + Src2 into (Q0, Q5, Q6, Q7) with carry out mask in Q8 + + mov Q0, [Q2] + add Q0, [Q3 ] + mov Q5, [Q2 + 8] + adc Q5, [Q3 + 8] + mov Q6, [Q2 + 16] + adc Q6, [Q3 + 16] + mov Q7, [Q2 + 24] + adc Q7, [Q3 + 24] + sbb Q8, Q8 // Q8 = carry out mask + + // Q2, Q3: free + // Compute sum - Mod into (Q2, Q3, Q9, Q10) = sum - modulus, Q1 = carry out mask + + add Q1, SymCryptModulusValueOffsetAmd64 + + mov Q2, Q0 + sub Q2, [Q1] + mov Q3, Q5 + sbb Q3, [Q1 + 8] + mov Q9, Q6 + sbb Q9, [Q1 + 16] + mov Q10, Q7 + sbb Q10, [Q1 + 24] + + sbb 
Q1, Q1 // Q1 = carry out mask + + // Choose between the two + // addition carry = 1, then subtraction carry = 1 and we pick the 2nd result. + // addition carry = 0 and subtraction carry = 0: pick 2nd result + // addition carry = 0 and subtraction carry = 1: pick first result + + xor Q1, Q8 // 0 = 2nd result, 1 = first result + + xor Q0, Q2 + xor Q5, Q3 + xor Q6, Q9 + xor Q7, Q10 + + and Q0, Q1 + and Q5, Q1 + and Q6, Q1 + and Q7, Q1 + + xor Q2, Q0 + xor Q3, Q5 + xor Q9, Q6 + xor Q10, Q7 + + mov [Q4 + 0], Q2 + mov [Q4 + 8], Q3 + mov [Q4 + 16], Q9 + mov [Q4 + 24], Q10 + +FUNCTION_END(SymCryptFdefModAdd256Asm) + + +FUNCTION_START(SymCryptFdefModSub256Asm, 4, 10) + + // Q1 = pmMod + // Q2 = peSrc1 + // Q3 = peSrc2 + // Q4 = peDst + + // compute Src1 - Src2 into (Q0, Q5, Q6, Q7) with carry out mask in Q8 + + mov Q0, [Q2] + sub Q0, [Q3] + mov Q5, [Q2 + 8] + sbb Q5, [Q3 + 8] + mov Q6, [Q2 + 16] + sbb Q6, [Q3 + 16] + mov Q7, [Q2 + 24] + sbb Q7, [Q3 + 24] + sbb Q8, Q8 // Q8 = carry out mask + + // Q2, Q3: free + // Load Mod into (Q2, Q3, Q9, Q1) + + add Q1, SymCryptModulusValueOffsetAmd64 + + mov Q2, [Q1] + mov Q3, [Q1 + 8] + mov Q9, [Q1 + 16] + mov Q1, [Q1 + 24] + + // Mask the value to be added to zero if there was no underflow + and Q2, Q8 + and Q3, Q8 + and Q9, Q8 + and Q1, Q8 + + // Add the (masked) modulus + add Q0, Q2 + adc Q5, Q3 + adc Q6, Q9 + adc Q7, Q1 + + mov [Q4 + 0], Q0 + mov [Q4 + 8], Q5 + mov [Q4 + 16], Q6 + mov [Q4 + 24], Q7 + +FUNCTION_END(SymCryptFdefModSub256Asm) + +//================================================= +// Multiplication +// + +#if defined(SYMCRYPT_MASM) +altentry SymCryptFdefMontgomeryReduce256AsmInternal +#endif + +//VOID +//SYMCRYPT_CALL +//SymCryptFdefModMulMontgomery256Asm( +// _In_ PCSYMCRYPT_MODULUS pMod, +// _In_ PCSYMCRYPT_MODELEMENT pSrc1, +// _In_ PCSYMCRYPT_MODELEMENT pSrc2, +// _Out_ PSYMCRYPT_MODELEMENT pDst, +// _Out_writes_bytes_( cbScratch ) PBYTE pbScratch, +// SIZE_T cbScratch ) + +// Note we specify only 4 arguments as we never use arguments 5 and 6 (saves some prolog code in MSFT calling convention) +MUL_FUNCTION_START(SymCryptFdefModMulMontgomery256Asm, 4, 14) + + // Q1 = pMod + // Q2 = pSrc1 + // Q3 = pSrc2 + // Q4 = pDst + + // First we compute the product. 
The result will be in 8 registers + // Q6, Q7, Q8, Q9, Q10, Q11, Q12, Q13 + + mov Q5, [Q2] + xor Q8, Q8 + xor Q9, Q9 + xor Q10, Q10 + + mov Q0, [Q3] + mul Q5 + mov Q6, Q0 + mov Q7, QH + + mov Q0, [Q3 + 8] + mul Q5 + add Q7, Q0 + adc Q8, QH + + mov Q0, [Q3 + 16] + mul Q5 + add Q8, Q0 + adc Q9, QH + + mov Q0, [Q3 + 24] + mul Q5 + add Q9, Q0 + adc Q10, QH + + // Second row + mov Q5, [Q2 + 8] + MUL14 Q0, QH, Q5, Q3, Q7, Q8, Q9, Q10, Q13 + mov Q11, QH + + // third row + mov Q5, [Q2 + 16] + MUL14 Q0, QH, Q5, Q3, Q8, Q9, Q10, Q11, Q13 + mov Q12, QH + + // fourth row + mov Q5, [Q2 + 24] + MUL14 Q0, QH, Q5, Q3, Q9, Q10, Q11, Q12, Q13 + mov Q13, QH + +ALTERNATE_ENTRY(SymCryptFdefMontgomeryReduce256AsmInternal) + // Invariant: + // common prologue used + // 512-bit result in (Q6, Q7, Q8, Q9, Q10, Q11, Q12, Q13) + // Q1 = pmMod + // Q4 = pDst + + mov Q3, [Q1 + SymCryptModulusMontgomeryInv64OffsetAmd64] // inv64 + add Q1, SymCryptModulusValueOffsetAmd64 + + mov Q5, Q6 + imul Q5, Q3 // lower word is the same for signed & unsigned multiply - Q5 = multiplicand for first row + MONTGOMERY14 Q0, QH, Q5, Q1, Q6, Q7, Q8, Q9, Q6 + mov Q6, QH // Save the out carries in (eventually) (Q6, Q7, Q8, Q9) + + mov Q5, Q7 + imul Q5, Q3 + MONTGOMERY14 Q0, QH, Q5, Q1, Q7, Q8, Q9, Q10, Q7 + mov Q7, QH // Save the out carries in (eventually) (Q6, Q7, Q8, Q9) + + mov Q5, Q8 + imul Q5, Q3 + MONTGOMERY14 Q0, QH, Q5, Q1, Q8, Q9, Q10, Q11, Q8 + mov Q8, QH + + mov Q5, Q9 + imul Q5, Q3 + MONTGOMERY14 Q0, QH, Q5, Q1, Q9, Q10, Q11, Q12, Q9 + // mov Q9, QH + + add Q10, Q6 + adc Q11, Q7 + adc Q12, Q8 + adc Q13, QH + + sbb Q5, Q5 // Carry out from final addition in mask form + + // reduced value in (Q10, Q11, Q12, Q13, -Q5), and it is less than 2*Modulus + + mov Q6, Q10 + sub Q6, [Q1] + mov Q7, Q11 + sbb Q7, [Q1 + 8] + mov Q8, Q12 + sbb Q8, [Q1 + 16] + mov Q9, Q13 + sbb Q9, [Q1 + 24] + + sbb Q1, Q1 // Q1 = carry out mask + + // Choose between the two + // addition carry = 1, then subtraction carry = 1 and we pick the 2nd result. + // addition carry = 0 and subtraction carry = 0: pick 2nd result + // addition carry = 0 and subtraction carry = 1: pick first result + + xor Q1, Q5 // 0 = 2nd result, 1 = first result + + xor Q10, Q6 + xor Q11, Q7 + xor Q12, Q8 + xor Q13, Q9 + + and Q10, Q1 + and Q11, Q1 + and Q12, Q1 + and Q13, Q1 + + xor Q6, Q10 + xor Q7, Q11 + xor Q8, Q12 + xor Q9, Q13 + + mov [Q4 + 0], Q6 + mov [Q4 + 8], Q7 + mov [Q4 + 16], Q8 + mov [Q4 + 24], Q9 + +MUL_FUNCTION_END(SymCryptFdefModMulMontgomery256Asm) + + +//VOID +//SYMCRYPT_CALL +//SymCryptFdefMontgomeryReduce256Asm( +// _In_ PCSYMCRYPT_MODULUS pmMod, +// _In_ PUINT32 pSrc, +// _Out_ PUINT32 pDst ) + +// Note we specify 4 arguments so that our prolog matches SymCryptFdefModMulMontgomery256Asm +MUL_FUNCTION_START(SymCryptFdefMontgomeryReduce256Asm, 4, 14) + + mov Q4, Q3 + mov Q6, [Q2 + 0] + mov Q7, [Q2 + 8] + mov Q8, [Q2 + 16] + mov Q9, [Q2 + 24] + mov Q10, [Q2 + 32] + mov Q11, [Q2 + 40] + mov Q12, [Q2 + 48] + mov Q13, [Q2 + 56] + + // Normal code doesn't jump from the body of one function to the body of another function. + // Here we have ensured that our stack frames are identical, so it is safe. + // We just have to convince the other system components that this works... 
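The invariant at SymCryptFdefMontgomeryReduce256AsmInternal — a double-width value reduced one word at a time, then conditionally corrected — is the same one the generic SymCryptFdefMontgomeryReduceAsm above maintains. A hypothetical C sketch of that word-by-word reduction (illustrative only, not SymCrypt source, assuming GCC/Clang's `unsigned __int128`):

```c
#include <stdint.h>

static void montgomery_reduce_sketch(
    uint64_t *src,          /* 2*n words; the low n words get cleared  */
    const uint64_t *mod,    /* n-word modulus                          */
    uint64_t inv64,         /* -1 / mod[0] mod 2^64                    */
    uint64_t *dst,          /* n-word result                           */
    int n )
{
    uint64_t hiCarry = 0;   /* carry out of the top word (Q9 above)    */

    for( int i = 0; i < n; i++ )
    {
        /* low 64 bits only; same for signed and unsigned multiply */
        uint64_t m = src[i] * inv64;        /* makes word i zero    */

        unsigned __int128 c = 0;
        for( int j = 0; j < n; j++ )
        {
            unsigned __int128 t = (unsigned __int128) m * mod[j] + src[i + j] + c;
            src[i + j] = (uint64_t) t;
            c = t >> 64;
        }
        /* fold the loop carry and the previous overflow into the next word */
        unsigned __int128 t = (unsigned __int128) src[i + n] + (uint64_t) c + hiCarry;
        src[i + n] = (uint64_t) t;
        hiCarry = (uint64_t)(t >> 64);
    }

    /* Value is now hiCarry:src[n..2n-1]; subtract the modulus ... */
    uint64_t borrow = 0;
    for( int j = 0; j < n; j++ )
    {
        unsigned __int128 t = (unsigned __int128) src[n + j] - mod[j] - borrow;
        dst[j] = (uint64_t) t;
        borrow = (uint64_t)(t >> 64) & 1;
    }
    /* ... and keep the unsubtracted value if that underflowed. The asm
       does this with a masked copy ("copy if: Q9 == 0 && Cy = 1") so the
       choice is constant-time; an if() is used here only for clarity.   */
    if( hiCarry == 0 && borrow )
        for( int j = 0; j < n; j++ ) dst[j] = src[n + j];
}
```

The final `hiCarry == 0 && borrow` test is the C analogue of the masked-copy loop that closes SymCryptFdefMontgomeryReduceAsm above.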
+ + // Use conditional jump so that stack unwinder doesn't think it is an epilogue + test rsp,rsp + jne SymCryptFdefMontgomeryReduce256AsmInternal // jumps always + + int 3 // Dummy instruction because the debugger seems to have an off-by-one + // error and still see the (wrong) epilogue when on the JNE instruction + // Best guess: the debugger starts the stack trace *after* the current instruction + +MUL_FUNCTION_END(SymCryptFdefMontgomeryReduce256Asm) + + +//VOID +//SYMCRYPT_CALL +//SymCryptFdefModSquareMontgomery256( +// _In_ PCSYMCRYPT_MODULUS pmMod, +// _In_ PCSYMCRYPT_MODELEMENT peSrc, +// _Out_ PSYMCRYPT_MODELEMENT peDst, +// _Out_writes_bytes_( cbScratch ) PBYTE pbScratch, +// SIZE_T cbScratch ) + +// Note we specify 4 arguments so that our prolog matches SymCryptFdefModMulMontgomery256Asm +MUL_FUNCTION_START(SymCryptFdefModSquareMontgomery256Asm, 4, 14) + + // Result in Q6, Q7, Q8, Q9, Q10, Q11, Q12, Q13 + + // Q1 = pmMod + // Q2 = peSrc + // Q3 = peDst + + mov Q4, Q3 + mov Q5, [Q2] + xor Q9, Q9 + xor Q10, Q10 + xor Q11, Q11 + xor Q12, Q12 + + // First we compute all the terms that need doubling + + mov Q0, [Q2 + 8] + mul Q5 + mov Q7, Q0 + mov Q8, QH + + mov Q0, [Q2 + 16] + mul Q5 + add Q8, Q0 + adc Q9, QH + + mov Q0, [Q2 + 24] + mul Q5 + add Q9, Q0 + adc Q10, QH + + mov Q5, [Q2 + 8] + mov Q0, [Q2 + 16] + mul Q5 + add Q9, Q0 + adc QH, 0 + mov Q13, QH + + mov Q0, [Q2 + 24] + mul Q5 + add Q10, Q0 + adc QH, 0 + add Q10, Q13 + adc Q11, QH + + mov Q5, [Q2 + 16] + mov Q0, [Q2 + 24] + mul Q5 + add Q11, Q0 + adc Q12, QH // no overflow from this + + // double these terms + xor Q13, Q13 + + add Q7, Q7 + adc Q8, Q8 + adc Q9, Q9 + adc Q10, Q10 + adc Q11, Q11 + adc Q12, Q12 + adc Q13, 0 + + mov Q0, [Q2] + mul Q0 + mov Q6, Q0 + mov Q5, QH + + mov Q0, [Q2 + 8] + mul Q0 + + add Q7, Q5 + adc Q8, Q0 + adc Q9, QH + sbb Q3, Q3 // -carry + + mov Q0, [Q2 + 16] + mul Q0 + + add Q3, Q3 + adc Q10, Q0 + adc Q11, QH + sbb Q3, Q3 + + mov Q0, [Q2 + 24] + mul Q0 + add Q3, Q3 + adc Q12, Q0 + adc Q13, QH + + // See SymCryptFdefMontgomeryReduce256Asm for a discussion of this strange epilogue sequence + test rsp,rsp + jne SymCryptFdefMontgomeryReduce256AsmInternal // jumps always + + int 3 + +MUL_FUNCTION_END(SymCryptFdefModSquareMontgomery256Asm) + +// -------------------------------- +// 512-bit size specific functions +// -------------------------------- + +//VOID +//SYMCRYPT_CALL +//SymCryptFdefRawMul512Asm( +// _In_reads_(nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc1, +// _In_reads_(nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc2, +// UINT32 nDigits, +// _Out_writes_(2*nWords) PUINT32 pDst ) +MUL_FUNCTION_START(SymCryptFdefRawMul512Asm, 4, 8) + + // Basic structure: + // for each word in Src1: + // Dst += Src2 * word + // Register assignments + // + // Q0 = tmp for mul + // QH = tmp for mul + // Q1 = pSrc1 (updated in outer loop) + // Q2 = pSrc2 (constant) + // Q3 = # words left from Src1 to process + // Q4 = pDst (incremented in outer loop) + // Q5 = word from Src1 to multiply with + // Q6 = carry for even words (64 bits) + // Q7 = carry for odd words (64 bits) + + shl Q3, 3 // nDigits * 8 = # words in Src1 to process + + mov Q5, [Q1] // mulword + xor Q6, Q6 // carry + + // First inner loop overwrites Dst, which avoids adding the current Dst value + MULT_SINGLEADD_128 0, Q2, Q4, Q0, QH, Q5, Q6, Q7 + MULT_SINGLEADD_128 2, Q2, Q4, Q0, QH, Q5, Q6, Q7 + MULT_SINGLEADD_128 4, Q2, Q4, Q0, QH, Q5, Q6, Q7 + MULT_SINGLEADD_128 6, Q2, Q4, Q0, QH, Q5, Q6, Q7 + + mov [Q4 + 64], Q6 // write last word, cannot 
overflow because Dst is at least 2 digits long + + dec Q3 + +ALIGN(16) + +SymCryptFdefRawMul512AsmLoopOuter: + + lea Q1, [Q1 + 8] // move to next word of pSrc1 + lea Q4, [Q4 + 8] // move Dst pointer one word over + + mov Q5, [Q1] // mulword + xor Q6, Q6 // carry + + MULT_DOUBLEADD_128 0, Q2, Q4, Q0, QH, Q5, Q6, Q7 + MULT_DOUBLEADD_128 2, Q2, Q4, Q0, QH, Q5, Q6, Q7 + MULT_DOUBLEADD_128 4, Q2, Q4, Q0, QH, Q5, Q6, Q7 + MULT_DOUBLEADD_128 6, Q2, Q4, Q0, QH, Q5, Q6, Q7 + + mov [Q4 + 64], Q6 // write last word, cannot overflow because Dst is at least 2 digits long + + dec Q3 + jnz SymCryptFdefRawMul512AsmLoopOuter + +MUL_FUNCTION_END(SymCryptFdefRawMul512Asm) + +// VOID +// SYMCRYPT_CALL +// SymCryptFdefRawSquare512Asm( +// _In_reads_(nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc, +// UINT32 nDigits, +// _Out_writes_(2*nWords) PUINT32 pDst ) +MUL_FUNCTION_START(SymCryptFdefRawSquare512Asm, 3, 11) + + // Register assignments + // + // Q0 = tmp for mul + // QH = tmp for mul + // Q1 = outer loop pointer into pSrc + // Q2 = nDigits (constant) + // Q3 = pDst (constant) + // Q4 = inner loop pointer into pSrc + // Q5 = inner loop pointer into pDst + // Q6 = word from Src to multiply with + // Q7 = carry for even words (64 bits) + // Q8 = carry for odd words (64 bits) + // Q9 = outer loop pointer into pDst + // Q10 = outer loop counter of #words left + + mov [rsp + GET_MEMSLOT_OFFSET(slot0)], Q1 // save pSrc + + //////////////////////////////////////////////////////////////// + // First Pass - Addition of the cross products x_i*x_j with i!=j + //////////////////////////////////////////////////////////////// + + mov Q9, Q3 // Q9 = outer pDst + + mov Q4, Q1 // Q4 = inner pSrc + mov Q5, Q3 // Q5 = inner pDst + + // Initial inner loop overwrites Dst, which avoids adding the current Dst value + // 7 iterations + xor Q8, Q8 // carry = 0 (for "odd" iterations set only the Q8 carry) + mov Q6, [Q1] // mulword + mov [Q5], Q8 // Write 0 in the first word + + SQR_SINGLEADD_64 1, Q4, Q5, Q0, QH, Q6, Q8, Q7 + SQR_SINGLEADD_64 2, Q4, Q5, Q0, QH, Q6, Q7, Q8 + SQR_SINGLEADD_64 3, Q4, Q5, Q0, QH, Q6, Q8, Q7 + + SQR_SINGLEADD_64 4, Q4, Q5, Q0, QH, Q6, Q7, Q8 + SQR_SINGLEADD_64 5, Q4, Q5, Q0, QH, Q6, Q8, Q7 + SQR_SINGLEADD_64 6, Q4, Q5, Q0, QH, Q6, Q7, Q8 + SQR_SINGLEADD_64 7, Q4, Q5, Q0, QH, Q6, Q8, Q7 + + mov [Q5 + 8*8], Q7 // write last word, cannot overflow because Dst is at least 2 digits long + add Q9, 8 // Skip over the first word + + // 6 iterations + xor Q7, Q7 // carry = 0 (for "even" iterations set only the Q7 carry) + SQR_SIZE_SPECIFIC_INIT Q1, Q9, Q4, Q5, Q6 + SQR_DOUBLEADD_64_2 0, Q4, Q5, Q0, QH, Q6, Q7, Q8 + SQR_DOUBLEADD_64_4 2, Q4, Q5, Q0, QH, Q6, Q7, Q8 + mov [Q5 + 6*8], Q7 + + // 5 iterations + xor Q8, Q8 + SQR_SIZE_SPECIFIC_INIT Q1, Q9, Q4, Q5, Q6 + SQR_DOUBLEADD_64 0, Q4, Q5, Q0, QH, Q6, Q8, Q7 // Notice the dst_carry is Q7 since all the "double" macros have Q7 as src_carry + SQR_DOUBLEADD_64_4 1, Q4, Q5, Q0, QH, Q6, Q7, Q8 + mov [Q5 + 5*8], Q7 + + // 4 iterations + xor Q7, Q7 + SQR_SIZE_SPECIFIC_INIT Q1, Q9, Q4, Q5, Q6 + SQR_DOUBLEADD_64_4 0, Q4, Q5, Q0, QH, Q6, Q7, Q8 + mov [Q5 + 4*8], Q7 + + // 3 iterations + xor Q8, Q8 + SQR_SIZE_SPECIFIC_INIT Q1, Q9, Q4, Q5, Q6 + SQR_DOUBLEADD_64 0, Q4, Q5, Q0, QH, Q6, Q8, Q7 + SQR_DOUBLEADD_64_2 1, Q4, Q5, Q0, QH, Q6, Q7, Q8 + mov [Q5 + 3*8], Q7 + + // 2 iterations + xor Q7, Q7 + SQR_SIZE_SPECIFIC_INIT Q1, Q9, Q4, Q5, Q6 + SQR_DOUBLEADD_64_2 0, Q4, Q5, Q0, QH, Q6, Q7, Q8 + mov [Q5 + 2*8], Q7 + + // 1 iterations + xor Q8, Q8 + SQR_SIZE_SPECIFIC_INIT Q1, Q9, Q4,
Q5, Q6 + SQR_DOUBLEADD_64 0, Q4, Q5, Q0, QH, Q6, Q8, Q7 + mov [Q5 + 8], Q7 + + xor QH, QH + mov [Q5 + 16], QH // Final word = 0 + + + //////////////////////////////////////////////////////////////// + // Second Pass - Shifting all results 1 bit left + //////////////////////////////////////////////////////////////// + + xor Q0, Q0 // carry flag = 0 + mov Q5, Q3 // pDst pointer + + SQR_SHIFT_LEFT 0, Q0, Q5 + SQR_SHIFT_LEFT 1, Q0, Q5 + SQR_SHIFT_LEFT 2, Q0, Q5 + SQR_SHIFT_LEFT 3, Q0, Q5 + + SQR_SHIFT_LEFT 4, Q0, Q5 + SQR_SHIFT_LEFT 5, Q0, Q5 + SQR_SHIFT_LEFT 6, Q0, Q5 + SQR_SHIFT_LEFT 7, Q0, Q5 + + SQR_SHIFT_LEFT 8, Q0, Q5 + SQR_SHIFT_LEFT 9, Q0, Q5 + SQR_SHIFT_LEFT 10, Q0, Q5 + SQR_SHIFT_LEFT 11, Q0, Q5 + + SQR_SHIFT_LEFT 12, Q0, Q5 + SQR_SHIFT_LEFT 13, Q0, Q5 + SQR_SHIFT_LEFT 14, Q0, Q5 + SQR_SHIFT_LEFT 15, Q0, Q5 + + ////////////////////////////////////////////////////////////////////////////// + // Third Pass - Adding the squares on the even columns and propagating the sum + ////////////////////////////////////////////////////////////////////////////// + + mov Q1, [rsp + GET_MEMSLOT_OFFSET(slot0)] // Q1 = pSrc + xor Q7, Q7 + + SQR_DIAGONAL_PROP 0, Q1, Q3, Q0, QH, Q7 + SQR_DIAGONAL_PROP 1, Q1, Q3, Q0, QH, Q7 + SQR_DIAGONAL_PROP 2, Q1, Q3, Q0, QH, Q7 + SQR_DIAGONAL_PROP 3, Q1, Q3, Q0, QH, Q7 + SQR_DIAGONAL_PROP 4, Q1, Q3, Q0, QH, Q7 + SQR_DIAGONAL_PROP 5, Q1, Q3, Q0, QH, Q7 + SQR_DIAGONAL_PROP 6, Q1, Q3, Q0, QH, Q7 + SQR_DIAGONAL_PROP 7, Q1, Q3, Q0, QH, Q7 + +MUL_FUNCTION_END(SymCryptFdefRawSquare512Asm) + +//VOID +//SymCryptFdefMontgomeryReduce512Asm( +// _In_ PCSYMCRYPT_MODULUS pmMod, +// _In_ PUINT32 pSrc, +// _Out_ PUINT32 pDst ) + +MUL_FUNCTION_START(SymCryptFdefMontgomeryReduce512Asm, 3, 13) + + mov D4, [Q1 + SymCryptModulusNdigitsOffsetAmd64] // nDigits + mov Q5, [Q1 + SymCryptModulusMontgomeryInv64OffsetAmd64] // inv64 + + lea Q1, [Q1 + SymCryptModulusValueOffsetAmd64] // modulus value + + mov D12, D4 // outer loop counter + shl D12, 3 // D12 is in words + + xor D9, D9 + + // General register allocations + // Q0 = multiply result + // QH = multiply result + // Q1 = pointer to modulus value + // Q2 = pSrc (updated in outer loop) + // Q3 = pDst + // D4 = nDigits + // Q5 = pmMod->tm.montgomery.inv64 + // Q6 = multiplier in inner loop + // Q7 = carry for even words (64 bits) + // Q8 = carry for odd words (64 bits) + // Q9 = carry out from last word of previous loop iteration + // Q10 = running pointer in Src + // Q11 = running pointer in Mod + // D12 = loop counter + +ALIGN(16) + +SymCryptFdefMontgomeryReduce512AsmOuterLoop: + + // start decoder with a few simple instructions, including at least one that requires + // a uop execution and is on the critical path + + mov Q6, [Q2] // fetch word of Src we want to set to zero + mov Q11, Q2 + mov Q10, Q1 + + imul Q6, Q5 // lower word is same for signed & unsigned multiply + + xor D7, D7 + + // Q0 = mul scratch + // QH = mul scratch + // Q1 = pointer to modulus value + // Q6 = multiplier + // Q7 = carry for even words (64 bits) + // Q8 = carry for odd words (64 bits) + // Q10 = running ptr to modulus + // Q11 = running ptr to input/scratch + // D12 = outer loop counter (words) + + MULT_DOUBLEADD_128 0, Q10, Q11, Q0, QH, Q6, Q7, Q8 + MULT_DOUBLEADD_128 2, Q10, Q11, Q0, QH, Q6, Q7, Q8 + MULT_DOUBLEADD_128 4, Q10, Q11, Q0, QH, Q6, Q7, Q8 + MULT_DOUBLEADD_128 6, Q10, Q11, Q0, QH, Q6, Q7, Q8 + + lea Q11,[Q11 + 64] + + add Q7, Q9 + mov D9, 0 + adc Q9, 0 + add Q7, [Q11] + adc Q9, 0 + mov [Q11], Q7 + + lea Q2,[Q2 + 8] + + dec D12 + jnz 
SymCryptFdefMontgomeryReduce512AsmOuterLoop + + // + // Most of the work is done - now all that is left is to subtract the modulus if it is smaller than the result + // + + // First we compute the pSrc result minus the modulus into the destination + mov Q11, Q2 // pSrc + mov Q10, Q1 // pMod + mov Q7, Q3 // pDst + + // Cy = 0 because the last 'adc Q9,0' resulted in 0, 1, or 2 + mov Q0,[Q11] + sbb Q0,[Q10] + mov [Q7], Q0 + + mov Q0,[Q11 + 8] + sbb Q0,[Q10 + 8] + mov [Q7 + 8], Q0 + + mov Q0,[Q11 + 16] + sbb Q0,[Q10 + 16] + mov [Q7 + 16], Q0 + + mov Q0,[Q11 + 24] + sbb Q0,[Q10 + 24] + mov [Q7 + 24], Q0 + + mov Q0,[Q11 + 32] + sbb Q0,[Q10 + 32] + mov [Q7 + 32], Q0 + + mov Q0,[Q11 + 40] + sbb Q0,[Q10 + 40] + mov [Q7 + 40], Q0 + + mov Q0,[Q11 + 48] + sbb Q0,[Q10 + 48] + mov [Q7 + 48], Q0 + + mov Q0,[Q11 + 56] + sbb Q0,[Q10 + 56] + mov [Q7 + 56], Q0 + + lea Q11,[Q11 + 64] + lea Q10,[Q10 + 64] + lea Q7,[Q7 + 64] + + // Finally a masked copy from pSrc to pDst + // copy if: Q9 == 0 && Cy = 1 + sbb D9, 0 + + movd xmm0, D9 // xmm0[0] = mask + pcmpeqd xmm1, xmm1 // xmm1 = ff...ff + pshufd xmm0, xmm0, 0 // xmm0[0..3] = mask + pxor xmm1, xmm0 // xmm1 = not Mask + +ALIGN(16) + +SymCryptFdefMontgomeryReduce512AsmMaskedCopyLoop: + movdqa xmm2, [Q2] // xmm2 = pSrc[0] + movdqa xmm3, [Q3] // xmm3 = pDst[0] + pand xmm2, xmm0 // + pand xmm3, xmm1 // + por xmm2, xmm3 + movdqa [Q3], xmm2 + + movdqa xmm2, [Q2 + 16] // xmm2 = pSrc[0] + movdqa xmm3, [Q3 + 16] // xmm3 = pDst[0] + pand xmm2, xmm0 // + pand xmm3, xmm1 // + por xmm2, xmm3 + movdqa [Q3 + 16], xmm2 + + movdqa xmm2, [Q2 + 32] // xmm2 = pSrc[0] + movdqa xmm3, [Q3 + 32] // xmm3 = pDst[0] + pand xmm2, xmm0 // + pand xmm3, xmm1 // + por xmm2, xmm3 + movdqa [Q3 + 32], xmm2 + + movdqa xmm2, [Q2 + 48] // xmm2 = pSrc[0] + movdqa xmm3, [Q3 + 48] // xmm3 = pDst[0] + pand xmm2, xmm0 // + pand xmm3, xmm1 // + por xmm2, xmm3 + movdqa [Q3 + 48], xmm2 + + // Move on to the next digit + lea Q2,[Q2 + 64] + lea Q3,[Q3 + 64] + + dec D4 + jnz SymCryptFdefMontgomeryReduce512AsmMaskedCopyLoop + +MUL_FUNCTION_END(SymCryptFdefMontgomeryReduce512Asm) + +// -------------------------------- +// 1024-bit size specific functions +// -------------------------------- + +//VOID +//SYMCRYPT_CALL +//SymCryptFdefRawMul1024Asm( +// _In_reads_(nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc1, +// _In_reads_(nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc2, +// UINT32 nDigits, +// _Out_writes_(2*nWords) PUINT32 pDst ) +MUL_FUNCTION_START(SymCryptFdefRawMul1024Asm, 4, 8) + + // Basic structure: + // for each word in Src1: + // Dst += Src2 * word + // Register assignments + // + // Q0 = tmp for mul + // QH = tmp for mul + // Q1 = pSrc1 (updated in outer loop) + // Q2 = pSrc2 (constant) + // Q3 = # words left from Src1 to process + // Q4 = pDst (incremented in outer loop) + // Q5 = word from Src1 to multiply with + // Q6 = carry for even words (64 bits) + // Q7 = carry for odd words (64 bits) + + shl Q3, 3 // nDigits * 8 = # words in Src1 to process + + mov Q5, [Q1] // mulword + xor Q6, Q6 // carry + + // First inner loop overwrites Dst, which avoids adding the current Dst value + MULT_SINGLEADD_128 0, Q2, Q4, Q0, QH, Q5, Q6, Q7 + MULT_SINGLEADD_128 2, Q2, Q4, Q0, QH, Q5, Q6, Q7 + MULT_SINGLEADD_128 4, Q2, Q4, Q0, QH, Q5, Q6, Q7 + MULT_SINGLEADD_128 6, Q2, Q4, Q0, QH, Q5, Q6, Q7 + + MULT_SINGLEADD_128 8, Q2, Q4, Q0, QH, Q5, Q6, Q7 + MULT_SINGLEADD_128 10, Q2, Q4, Q0, QH, Q5, Q6, Q7 + MULT_SINGLEADD_128 12, Q2, Q4, Q0, QH, Q5, Q6, Q7 + MULT_SINGLEADD_128 14, Q2, Q4, Q0, QH, Q5, Q6, Q7 + + mov [Q4
+ 128], Q6 // write last word, cannot overflow because Dst is at least 2 digits long + + dec Q3 + +ALIGN(16) + +SymCryptFdefRawMul1024AsmLoopOuter: + + lea Q1, [Q1 + 8] // move to next word of pSrc1 + lea Q4, [Q4 + 8] // move Dst pointer one word over + + mov Q5, [Q1] // mulword + + xor Q6, Q6 // carry + + MULT_DOUBLEADD_128 0, Q2, Q4, Q0, QH, Q5, Q6, Q7 + MULT_DOUBLEADD_128 2, Q2, Q4, Q0, QH, Q5, Q6, Q7 + MULT_DOUBLEADD_128 4, Q2, Q4, Q0, QH, Q5, Q6, Q7 + MULT_DOUBLEADD_128 6, Q2, Q4, Q0, QH, Q5, Q6, Q7 + + MULT_DOUBLEADD_128 8, Q2, Q4, Q0, QH, Q5, Q6, Q7 + MULT_DOUBLEADD_128 10, Q2, Q4, Q0, QH, Q5, Q6, Q7 + MULT_DOUBLEADD_128 12, Q2, Q4, Q0, QH, Q5, Q6, Q7 + MULT_DOUBLEADD_128 14, Q2, Q4, Q0, QH, Q5, Q6, Q7 + + mov [Q4 + 128], Q6 // write last word, cannot overflow because Dst is at least 2 digits long + + dec Q3 + jnz SymCryptFdefRawMul1024AsmLoopOuter + +MUL_FUNCTION_END(SymCryptFdefRawMul1024Asm) + +// VOID +// SYMCRYPT_CALL +// SymCryptFdefRawSquare1024Asm( +// _In_reads_(nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc, +// UINT32 nDigits, +// _Out_writes_(2*nWords) PUINT32 pDst ) +MUL_FUNCTION_START(SymCryptFdefRawSquare1024Asm, 3, 11) + + // Register assignments + // + // Q0 = tmp for mul + // QH = tmp for mul + // Q1 = outer loop pointer into pSrc + // Q2 = nDigits (constant) + // Q3 = pDst (constant) + // Q4 = inner loop pointer into pSrc + // Q5 = inner loop pointer into pDst + // Q6 = word from Src to multiply with + // Q7 = carry for even words (64 bits) + // Q8 = carry for odd words (64 bits) + // Q9 = outer loop pointer into pDst + // Q10 = outer loop counter of #words left + + mov [rsp + GET_MEMSLOT_OFFSET(slot0)], Q1 // save pSrc + + //////////////////////////////////////////////////////////////// + // First Pass - Addition of the cross products x_i*x_j with i!=j + //////////////////////////////////////////////////////////////// + + mov Q9, Q3 // Q9 = outer pDst + + mov Q4, Q1 // Q4 = inner pSrc + mov Q5, Q3 // Q5 = inner pDst + + // Initial inner loop overwrites Dst, which avoids adding the current Dst value + + // 15 iterations + xor Q8, Q8 // carry = 0 (for "odd" iterations set only the Q8 carry) + mov Q6, [Q1] // mulword + mov [Q5], Q8 // Write 0 in the first word + + SQR_SINGLEADD_64 1, Q4, Q5, Q0, QH, Q6, Q8, Q7 + SQR_SINGLEADD_64 2, Q4, Q5, Q0, QH, Q6, Q7, Q8 + SQR_SINGLEADD_64 3, Q4, Q5, Q0, QH, Q6, Q8, Q7 + + SQR_SINGLEADD_64 4, Q4, Q5, Q0, QH, Q6, Q7, Q8 + SQR_SINGLEADD_64 5, Q4, Q5, Q0, QH, Q6, Q8, Q7 + SQR_SINGLEADD_64 6, Q4, Q5, Q0, QH, Q6, Q7, Q8 + SQR_SINGLEADD_64 7, Q4, Q5, Q0, QH, Q6, Q8, Q7 + + SQR_SINGLEADD_64 8, Q4, Q5, Q0, QH, Q6, Q7, Q8 + SQR_SINGLEADD_64 9, Q4, Q5, Q0, QH, Q6, Q8, Q7 + SQR_SINGLEADD_64 10, Q4, Q5, Q0, QH, Q6, Q7, Q8 + SQR_SINGLEADD_64 11, Q4, Q5, Q0, QH, Q6, Q8, Q7 + + SQR_SINGLEADD_64 12, Q4, Q5, Q0, QH, Q6, Q7, Q8 + SQR_SINGLEADD_64 13, Q4, Q5, Q0, QH, Q6, Q8, Q7 + SQR_SINGLEADD_64 14, Q4, Q5, Q0, QH, Q6, Q7, Q8 + SQR_SINGLEADD_64 15, Q4, Q5, Q0, QH, Q6, Q8, Q7 + + mov [Q5 + 16*8], Q7 // write last word, cannot overflow because Dst is at least 2 digits long + add Q9, 8 // Skip over the first word + + // 14 iterations (adding the current Dst value) + xor Q7, Q7 // carry = 0 (for "even" iterations set only the Q7 carry) + SQR_SIZE_SPECIFIC_INIT Q1, Q9, Q4, Q5, Q6 + SQR_DOUBLEADD_64_2 0, Q4, Q5, Q0, QH, Q6, Q7, Q8 + SQR_DOUBLEADD_64_4 2, Q4, Q5, Q0, QH, Q6, Q7, Q8 + SQR_DOUBLEADD_64_8 6, Q4, Q5, Q0, QH, Q6, Q7, Q8 + mov [Q5 + 14*8], Q7 + + // 13 iterations + xor Q8, Q8 + SQR_SIZE_SPECIFIC_INIT Q1, Q9, Q4, Q5, Q6 + SQR_DOUBLEADD_64 0,
Q4, Q5, Q0, QH, Q6, Q8, Q7 // Notice the dst_carry is Q7 since all the "double" macros have Q7 as src_carry + SQR_DOUBLEADD_64_4 1, Q4, Q5, Q0, QH, Q6, Q7, Q8 + SQR_DOUBLEADD_64_8 5, Q4, Q5, Q0, QH, Q6, Q7, Q8 + mov [Q5 + 13*8], Q7 + + // 12 iterations + xor Q7, Q7 + SQR_SIZE_SPECIFIC_INIT Q1, Q9, Q4, Q5, Q6 + SQR_DOUBLEADD_64_4 0, Q4, Q5, Q0, QH, Q6, Q7, Q8 + SQR_DOUBLEADD_64_8 4, Q4, Q5, Q0, QH, Q6, Q7, Q8 + mov [Q5 + 12*8], Q7 + + // 11 iterations + xor Q8, Q8 + SQR_SIZE_SPECIFIC_INIT Q1, Q9, Q4, Q5, Q6 + SQR_DOUBLEADD_64 0, Q4, Q5, Q0, QH, Q6, Q8, Q7 + SQR_DOUBLEADD_64_2 1, Q4, Q5, Q0, QH, Q6, Q7, Q8 + SQR_DOUBLEADD_64_8 3, Q4, Q5, Q0, QH, Q6, Q7, Q8 + mov [Q5 + 11*8], Q7 + + // 10 iterations + xor Q7, Q7 + SQR_SIZE_SPECIFIC_INIT Q1, Q9, Q4, Q5, Q6 + SQR_DOUBLEADD_64_2 0, Q4, Q5, Q0, QH, Q6, Q7, Q8 + SQR_DOUBLEADD_64_8 2, Q4, Q5, Q0, QH, Q6, Q7, Q8 + mov [Q5 + 10*8], Q7 + + // 9 iterations + xor Q8, Q8 + SQR_SIZE_SPECIFIC_INIT Q1, Q9, Q4, Q5, Q6 + SQR_DOUBLEADD_64 0, Q4, Q5, Q0, QH, Q6, Q8, Q7 + SQR_DOUBLEADD_64_8 1, Q4, Q5, Q0, QH, Q6, Q7, Q8 + mov [Q5 + 9*8], Q7 + + // 8 iterations + xor Q7, Q7 + SQR_SIZE_SPECIFIC_INIT Q1, Q9, Q4, Q5, Q6 + SQR_DOUBLEADD_64_8 0, Q4, Q5, Q0, QH, Q6, Q7, Q8 + mov [Q5 + 8*8], Q7 + + // 7 iterations + xor Q8, Q8 + SQR_SIZE_SPECIFIC_INIT Q1, Q9, Q4, Q5, Q6 + SQR_DOUBLEADD_64 0, Q4, Q5, Q0, QH, Q6, Q8, Q7 + SQR_DOUBLEADD_64_2 1, Q4, Q5, Q0, QH, Q6, Q7, Q8 + SQR_DOUBLEADD_64_4 3, Q4, Q5, Q0, QH, Q6, Q7, Q8 + mov [Q5 + 7*8], Q7 + + // 6 iterations + xor Q7, Q7 + SQR_SIZE_SPECIFIC_INIT Q1, Q9, Q4, Q5, Q6 + SQR_DOUBLEADD_64_2 0, Q4, Q5, Q0, QH, Q6, Q7, Q8 + SQR_DOUBLEADD_64_4 2, Q4, Q5, Q0, QH, Q6, Q7, Q8 + mov [Q5 + 6*8], Q7 + + // 5 iterations + xor Q8, Q8 + SQR_SIZE_SPECIFIC_INIT Q1, Q9, Q4, Q5, Q6 + SQR_DOUBLEADD_64 0, Q4, Q5, Q0, QH, Q6, Q8, Q7 + SQR_DOUBLEADD_64_4 1, Q4, Q5, Q0, QH, Q6, Q7, Q8 + mov [Q5 + 5*8], Q7 + + // 4 iterations + xor Q7, Q7 + SQR_SIZE_SPECIFIC_INIT Q1, Q9, Q4, Q5, Q6 + SQR_DOUBLEADD_64_4 0, Q4, Q5, Q0, QH, Q6, Q7, Q8 + mov [Q5 + 4*8], Q7 + + // 3 iterations + xor Q8, Q8 + SQR_SIZE_SPECIFIC_INIT Q1, Q9, Q4, Q5, Q6 + SQR_DOUBLEADD_64 0, Q4, Q5, Q0, QH, Q6, Q8, Q7 + SQR_DOUBLEADD_64_2 1, Q4, Q5, Q0, QH, Q6, Q7, Q8 + mov [Q5 + 3*8], Q7 + + // 2 iterations + xor Q7, Q7 + SQR_SIZE_SPECIFIC_INIT Q1, Q9, Q4, Q5, Q6 + SQR_DOUBLEADD_64_2 0, Q4, Q5, Q0, QH, Q6, Q7, Q8 + mov [Q5 + 2*8], Q7 + + // 1 iterations + xor Q8, Q8 + SQR_SIZE_SPECIFIC_INIT Q1, Q9, Q4, Q5, Q6 + SQR_DOUBLEADD_64 0, Q4, Q5, Q0, QH, Q6, Q8, Q7 + mov [Q5 + 8], Q7 + + xor QH, QH + mov [Q5 + 16], QH // Final word = 0 + + + //////////////////////////////////////////////////////////////// + // Second Pass - Shifting all results 1 bit left + //////////////////////////////////////////////////////////////// + + xor Q0, Q0 // carry flag = 0 + mov Q5, Q3 // pDst pointer + + SQR_SHIFT_LEFT 0, Q0, Q5 + SQR_SHIFT_LEFT 1, Q0, Q5 + SQR_SHIFT_LEFT 2, Q0, Q5 + SQR_SHIFT_LEFT 3, Q0, Q5 + + SQR_SHIFT_LEFT 4, Q0, Q5 + SQR_SHIFT_LEFT 5, Q0, Q5 + SQR_SHIFT_LEFT 6, Q0, Q5 + SQR_SHIFT_LEFT 7, Q0, Q5 + + SQR_SHIFT_LEFT 8, Q0, Q5 + SQR_SHIFT_LEFT 9, Q0, Q5 + SQR_SHIFT_LEFT 10, Q0, Q5 + SQR_SHIFT_LEFT 11, Q0, Q5 + + SQR_SHIFT_LEFT 12, Q0, Q5 + SQR_SHIFT_LEFT 13, Q0, Q5 + SQR_SHIFT_LEFT 14, Q0, Q5 + SQR_SHIFT_LEFT 15, Q0, Q5 + + SQR_SHIFT_LEFT 16, Q0, Q5 + SQR_SHIFT_LEFT 17, Q0, Q5 + SQR_SHIFT_LEFT 18, Q0, Q5 + SQR_SHIFT_LEFT 19, Q0, Q5 + + SQR_SHIFT_LEFT 20, Q0, Q5 + SQR_SHIFT_LEFT 21, Q0, Q5 + SQR_SHIFT_LEFT 22, Q0, Q5 + SQR_SHIFT_LEFT 23, Q0, Q5 + + SQR_SHIFT_LEFT 24, Q0, Q5 + SQR_SHIFT_LEFT 25, Q0, 
Q5 + SQR_SHIFT_LEFT 26, Q0, Q5 + SQR_SHIFT_LEFT 27, Q0, Q5 + + SQR_SHIFT_LEFT 28, Q0, Q5 + SQR_SHIFT_LEFT 29, Q0, Q5 + SQR_SHIFT_LEFT 30, Q0, Q5 + SQR_SHIFT_LEFT 31, Q0, Q5 + + ////////////////////////////////////////////////////////////////////////////// + // Third Pass - Adding the squares on the even columns and propagating the sum + ////////////////////////////////////////////////////////////////////////////// + + mov Q1, [rsp + GET_MEMSLOT_OFFSET(slot0)] // Q1 = pSrc + xor Q7, Q7 + + SQR_DIAGONAL_PROP 0, Q1, Q3, Q0, QH, Q7 + SQR_DIAGONAL_PROP 1, Q1, Q3, Q0, QH, Q7 + SQR_DIAGONAL_PROP 2, Q1, Q3, Q0, QH, Q7 + SQR_DIAGONAL_PROP 3, Q1, Q3, Q0, QH, Q7 + SQR_DIAGONAL_PROP 4, Q1, Q3, Q0, QH, Q7 + SQR_DIAGONAL_PROP 5, Q1, Q3, Q0, QH, Q7 + SQR_DIAGONAL_PROP 6, Q1, Q3, Q0, QH, Q7 + SQR_DIAGONAL_PROP 7, Q1, Q3, Q0, QH, Q7 + + SQR_DIAGONAL_PROP 8, Q1, Q3, Q0, QH, Q7 + SQR_DIAGONAL_PROP 9, Q1, Q3, Q0, QH, Q7 + SQR_DIAGONAL_PROP 10, Q1, Q3, Q0, QH, Q7 + SQR_DIAGONAL_PROP 11, Q1, Q3, Q0, QH, Q7 + SQR_DIAGONAL_PROP 12, Q1, Q3, Q0, QH, Q7 + SQR_DIAGONAL_PROP 13, Q1, Q3, Q0, QH, Q7 + SQR_DIAGONAL_PROP 14, Q1, Q3, Q0, QH, Q7 + SQR_DIAGONAL_PROP 15, Q1, Q3, Q0, QH, Q7 + +MUL_FUNCTION_END(SymCryptFdefRawSquare1024Asm) + +//VOID +//SymCryptFdefMontgomeryReduce1024Asm( +// _In_ PCSYMCRYPT_MODULUS pmMod, +// _In_ PUINT32 pSrc, +// _Out_ PUINT32 pDst ) + +MUL_FUNCTION_START(SymCryptFdefMontgomeryReduce1024Asm, 3, 13) + + mov D4, [Q1 + SymCryptModulusNdigitsOffsetAmd64] // nDigits + mov Q5, [Q1 + SymCryptModulusMontgomeryInv64OffsetAmd64] // inv64 + + lea Q1, [Q1 + SymCryptModulusValueOffsetAmd64] // modulus value + + mov D12, D4 // outer loop counter + shl D12, 3 // D12 is in words + + xor D9, D9 + + // General register allocations + // Q0 = multiply result + // QH = multiply result + // Q1 = pointer to modulus value + // Q2 = pSrc (updated in outer loop) + // Q3 = pDst + // D4 = nDigits + // Q5 = pmMod->tm.montgomery.inv64 + // Q6 = multiplier in inner loop + // Q7 = carry for even words (64 bits) + // Q8 = carry for odd words (64 bits) + // Q9 = carry out from last word of previous loop iteration + // Q10 = running pointer in Src + // Q11 = running pointer in Mod + // D12 = loop counter + +ALIGN(16) + +SymCryptFdefMontgomeryReduce1024AsmOuterLoop: + + // start decoder with a few simple instructions, including at least one that requires + // a uop execution and is on the critical path + + mov Q6, [Q2] // fetch word of Src we want to set to zero + mov Q11, Q2 + mov Q10, Q1 + + imul Q6, Q5 // lower word is same for signed & unsigned multiply + + xor D7, D7 + + // Q0 = mul scratch + // QH = mul scratch + // Q1 = pointer to modulus value + // Q6 = multiplier + // Q7 = carry for even words (64 bits) + // Q8 = carry for odd words (64 bits) + // Q10 = running ptr to modulus + // Q11 = running ptr to input/scratch + // D12 = outer loop counter (words) + + MULT_DOUBLEADD_128 0, Q10, Q11, Q0, QH, Q6, Q7, Q8 + MULT_DOUBLEADD_128 2, Q10, Q11, Q0, QH, Q6, Q7, Q8 + MULT_DOUBLEADD_128 4, Q10, Q11, Q0, QH, Q6, Q7, Q8 + MULT_DOUBLEADD_128 6, Q10, Q11, Q0, QH, Q6, Q7, Q8 + + MULT_DOUBLEADD_128 8, Q10, Q11, Q0, QH, Q6, Q7, Q8 + MULT_DOUBLEADD_128 10, Q10, Q11, Q0, QH, Q6, Q7, Q8 + MULT_DOUBLEADD_128 12, Q10, Q11, Q0, QH, Q6, Q7, Q8 + MULT_DOUBLEADD_128 14, Q10, Q11, Q0, QH, Q6, Q7, Q8 + + lea Q11,[Q11 + 128] + + add Q7, Q9 + mov D9, 0 + adc Q9, 0 + add Q7, [Q11] + adc Q9, 0 + mov [Q11], Q7 + + lea Q2,[Q2 + 8] + + dec D12 + jnz SymCryptFdefMontgomeryReduce1024AsmOuterLoop + + // + // Most of the work is done - now all that is 
left is to subtract the modulus if it is smaller than the result + // + + // First we compute the pSrc result minus the modulus into the destination + mov D12, D4 // loop ctr + mov Q11, Q2 // pSrc + mov Q10, Q1 // pMod + mov Q7, Q3 // pDst + + // Cy = 0 because the last 'adc Q9,0' resulted in 0, 1, or 2 + +ALIGN(16) + +SymCryptFdefMontgomeryReduce1024AsmSubLoop: + mov Q0,[Q11] + sbb Q0,[Q10] + mov [Q7], Q0 + + mov Q0,[Q11 + 8] + sbb Q0,[Q10 + 8] + mov [Q7 + 8], Q0 + + mov Q0,[Q11 + 16] + sbb Q0,[Q10 + 16] + mov [Q7 + 16], Q0 + + mov Q0,[Q11 + 24] + sbb Q0,[Q10 + 24] + mov [Q7 + 24], Q0 + + mov Q0,[Q11 + 32] + sbb Q0,[Q10 + 32] + mov [Q7 + 32], Q0 + + mov Q0,[Q11 + 40] + sbb Q0,[Q10 + 40] + mov [Q7 + 40], Q0 + + mov Q0,[Q11 + 48] + sbb Q0,[Q10 + 48] + mov [Q7 + 48], Q0 + + mov Q0,[Q11 + 56] + sbb Q0,[Q10 + 56] + mov [Q7 + 56], Q0 + + lea Q11,[Q11 + 64] + lea Q10,[Q10 + 64] + lea Q7,[Q7 + 64] + + dec D12 + jnz SymCryptFdefMontgomeryReduce1024AsmSubLoop + + // Finally a masked copy from pSrc to pDst + // copy if: Q9 == 0 && Cy = 1 + sbb D9, 0 + + movd xmm0, D9 // xmm0[0] = mask + pcmpeqd xmm1, xmm1 // xmm1 = ff...ff + pshufd xmm0, xmm0, 0 // xmm0[0..3] = mask + pxor xmm1, xmm0 // xmm1 = not Mask + +ALIGN(16) + +SymCryptFdefMontgomeryReduce1024AsmMaskedCopyLoop: + movdqa xmm2, [Q2] // xmm2 = pSrc[0] + movdqa xmm3, [Q3] // xmm3 = pDst[0] + pand xmm2, xmm0 // + pand xmm3, xmm1 // + por xmm2, xmm3 + movdqa [Q3], xmm2 + + movdqa xmm2, [Q2 + 16] // xmm2 = pSrc[0] + movdqa xmm3, [Q3 + 16] // xmm3 = pDst[0] + pand xmm2, xmm0 // + pand xmm3, xmm1 // + por xmm2, xmm3 + movdqa [Q3 + 16], xmm2 + + movdqa xmm2, [Q2 + 32] // xmm2 = pSrc[0] + movdqa xmm3, [Q3 + 32] // xmm3 = pDst[0] + pand xmm2, xmm0 // + pand xmm3, xmm1 // + por xmm2, xmm3 + movdqa [Q3 + 32], xmm2 + + movdqa xmm2, [Q2 + 48] // xmm2 = pSrc[0] + movdqa xmm3, [Q3 + 48] // xmm3 = pDst[0] + pand xmm2, xmm0 // + pand xmm3, xmm1 // + por xmm2, xmm3 + movdqa [Q3 + 48], xmm2 + + // Move on to the next digit + lea Q2,[Q2 + 64] + lea Q3,[Q3 + 64] + + dec D4 + jnz SymCryptFdefMontgomeryReduce1024AsmMaskedCopyLoop + +MUL_FUNCTION_END(SymCryptFdefMontgomeryReduce1024Asm) + +FILE_END() diff --git a/lib/amd64/fdef_mul_macros.asm b/lib/amd64/fdef_mul_macros.asm deleted file mode 100644 index 24ccd43..0000000 --- a/lib/amd64/fdef_mul_macros.asm +++ /dev/null @@ -1,224 +0,0 @@ -; -; Macros for the multiplication routines in amd64 -; Copyright (c) Microsoft Corporation. Licensed under the MIT license.
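The deleted macros below are easier to follow against a portable reference. Here is a hedged C sketch of the word-by-word multiply-accumulate that MULT_SINGLEADD_128 and MULT_DOUBLEADD_128 unroll (muladd_word is an illustrative name only; unsigned __int128 is the GCC/Clang stand-in for the 64x64->128 MUL instruction):

    #include <stddef.h>
    #include <stdint.h>

    /* Dst[0..n-1] += Src[0..n-1] * mulword; returns the carry word that the
       callers store one word past the end. The SINGLEADD variant is the same
       with the "+ dst[i]" dropped, since the first row overwrites Dst. */
    static uint64_t muladd_word( uint64_t *dst, const uint64_t *src, size_t n, uint64_t mulword )
    {
        uint64_t carry = 0;

        for( size_t i = 0; i < n; i++ )
        {
            unsigned __int128 p = (unsigned __int128) src[i] * mulword + dst[i] + carry;
            dst[i] = (uint64_t) p;
            carry = (uint64_t) (p >> 64);
        }
        return carry;
    }

The even/odd carry registers in the asm are two names for the same logical carry chain; alternating them lets each multiply's result land in a fresh register.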
-; - -; General multiplication - -MULT_SINGLEADD_128 MACRO index, src_reg, dst_reg - ; rax = mul scratch - ; rbx = multiplier - ; rdx = mul scratch - ; src_reg = running ptr to input - ; dst_reg = running ptr to output/scratch - ; r12 = carry for even words (64 bits) - ; r15 = carry for odd words (64 bits) - - mov rax, [src_reg + 8*index] - mul rbx - mov r15, rdx - add rax, r12 - mov [dst_reg + 8*index], rax - adc r15, 0 - - mov rax, [src_reg + 8*(index+1)] - mul rbx - mov r12, rdx - add rax, r15 - mov [dst_reg + 8*(index+1)], rax - adc r12, 0 - - ENDM - -MULT_DOUBLEADD_128 MACRO index, src_reg, dst_reg - ; rax = mul scratch - ; rbx = multiplier - ; rdx = mul scratch - ; src_reg = running ptr to input - ; dst_reg = running ptr to output/scratch - ; r12 = carry for even words (64 bits) - ; r15 = carry for odd words (64 bits) - - mov rax, [src_reg + 8*index] - mul rbx - mov r15, rdx - add rax, [dst_reg + 8*index] - adc r15, 0 - add rax, r12 - mov [dst_reg + 8*index], rax - adc r15, 0 - - mov rax, [src_reg + 8*(index+1)] - mul rbx - mov r12, rdx - add rax, [dst_reg + 8*(index+1)] - adc r12, 0 - add rax, r15 - mov [dst_reg + 8*(index+1)], rax - adc r12, 0 - - ENDM - -; Squaring - -SQR_SINGLEADD_64 MACRO index, src_reg, dst_reg, src_carry, dst_carry - ; rax = mul scratch - ; rbx = multiplier - ; rdx = mul scratch - ; src_reg = running ptr to input - ; dst_reg = running ptr to output/scratch - ; src_carry = input carry - ; dst_carry = output carry - - mov rax, [src_reg + 8*index] - mul rbx - mov dst_carry, rdx - add rax, src_carry - mov [dst_reg + 8*index], rax - adc dst_carry, 0 - - ENDM - -SQR_DOUBLEADD_64 MACRO index, src_reg, dst_reg, src_carry, dst_carry - ; rax = mul scratch - ; rbx = multiplier - ; rdx = mul scratch - ; src_reg = running ptr to input - ; dst_reg = running ptr to output/scratch - ; src_carry = input carry - ; dst_carry = output carry - - mov rax, [src_reg + 8*index] - mul rbx - mov dst_carry, rdx - add rax, [dst_reg + 8*index] - adc dst_carry, 0 - add rax, src_carry - mov [dst_reg + 8*index], rax - adc dst_carry, 0 - - ENDM - -SQR_SHIFT_LEFT MACRO index - mov rax, [rdi + 8*index] - adc rax, rax ; Shift let and add the carry - mov [rdi + 8*index], rax - ENDM - -SQR_DIAGONAL_PROP MACRO index - ;;;;;;;;;;;;;;;;;;;;;;;; - ; Calculating the square - mov rax, [rsi + 8*index] ; mulword - mul rax ; m^2 - - ; Adding the square to the even column - add rax, [rdi + 16*index] - adc rdx, 0 - add rax, r12 - adc rdx, 0 - mov [rdi + 16*index], rax - - ; Propagating the sum to the next column - mov rax, rdx - xor rdx, rdx - - add rax, [rdi + 16*index + 8] - adc rdx, 0 - mov [rdi + 16*index + 8], rax - mov r12, rdx - ENDM - -; Size-specific macros -; A common prologue & epilogue between several functions allows jumping between them... 
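Stepping back: SQR_SINGLEADD_64/SQR_DOUBLEADD_64, SQR_SHIFT_LEFT, and SQR_DIAGONAL_PROP above are the building blocks of a three-pass squaring, the same scheme the 512- and 1024-bit square functions earlier in this patch unroll. A hedged C model of the whole pass structure (square_model is an illustrative name, under the same unsigned __int128 assumption as above):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    static void square_model( const uint64_t *a, size_t n, uint64_t *d )    /* d has 2n words */
    {
        memset( d, 0, 2 * n * sizeof(uint64_t) );

        for( size_t i = 0; i < n; i++ )             /* pass 1: cross products a[i]*a[j], i < j */
        {
            uint64_t carry = 0;
            for( size_t j = i + 1; j < n; j++ )
            {
                unsigned __int128 p = (unsigned __int128) a[i] * a[j] + d[i + j] + carry;
                d[i + j] = (uint64_t) p;
                carry = (uint64_t) (p >> 64);
            }
            d[i + n] = carry;                       /* untouched so far, cannot overflow */
        }

        uint64_t cy = 0;
        for( size_t k = 0; k < 2 * n; k++ )         /* pass 2: double, via a 1-bit left shift */
        {
            uint64_t hi = d[k] >> 63;
            d[k] = (d[k] << 1) | cy;
            cy = hi;
        }

        unsigned __int128 c = 0;
        for( size_t i = 0; i < n; i++ )             /* pass 3: squares on the even columns */
        {
            c += (unsigned __int128) a[i] * a[i] + d[2 * i];
            d[2 * i] = (uint64_t) c;
            c = (c >> 64) + d[2 * i + 1];
            d[2 * i + 1] = (uint64_t) c;
            c >>= 64;                               /* 0 or 1, propagated to the next column */
        }
    }

Pass 1 needs n(n-1)/2 multiplications and pass 3 another n, roughly half of a general n-by-n product, which is why squaring gets dedicated code at all.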
- -MULT_COMMON_PROLOGUE MACRO - ; We need all the registers - push_reg r12 - push_reg r13 - push_reg r14 - push_reg r15 - push_reg rdi - push_reg rsi - push_reg rbx - push_reg rbp - - END_PROLOGUE - ENDM - -MULT_COMMON_EPILOGUE MACRO - BEGIN_EPILOGUE - - pop rbp - pop rbx - pop rsi - pop rdi - pop r15 - pop r14 - pop r13 - pop r12 - ret - ENDM - - -MUL14 MACRO Mult, pA, R0, R1, R2, R3, Cy - ; (R0, R1, R2, R3, rdx) = Mult * (A0..3) + (R0, R1, R2, R3) - ; Cy, rax = scratch - - mov rax, [pA] - mul Mult - add R0, rax - adc rdx, 0 - mov Cy, rdx - - mov rax, [pA + 8] - mul Mult - add R1, rax - adc rdx, 0 - add R1, Cy - adc rdx, 0 - mov Cy, rdx - - mov rax, [pA + 16] - mul Mult - add R2, rax - adc rdx, 0 - add R2, Cy - adc rdx, 0 - mov Cy, rdx - - mov rax, [pA + 24] - mul Mult - add R3, rax - adc rdx, 0 - add R3, Cy - adc rdx, 0 - - ENDM - -; Macros for size-specific squaring - -SQR_DOUBLEADD_64_2 MACRO index - SQR_DOUBLEADD_64 (index), rsi, rdi, r12, r15 - SQR_DOUBLEADD_64 (index + 1), rsi, rdi, r15, r12 - ENDM - -SQR_DOUBLEADD_64_4 MACRO index - SQR_DOUBLEADD_64_2 (index) - SQR_DOUBLEADD_64_2 (index + 2) - ENDM - -SQR_DOUBLEADD_64_8 MACRO index - SQR_DOUBLEADD_64_4 (index) - SQR_DOUBLEADD_64_4 (index + 4) - ENDM - -SQR_SIZE_SPECIFIC_INIT MACRO - lea rcx, [rcx + 8] ; move Src pointer 1 word over - lea r10, [r10 + 16] ; move Dst pointer 2 words over - - mov rsi, rcx ; rsi = inner pSrc - mov rdi, r10 ; rdi = inner pDst - - mov rbx, [rcx] ; Get the next mulword - lea rsi, [rsi + 8] ; move Src pointer 1 word over - ENDM \ No newline at end of file diff --git a/lib/amd64/fdef_mulx.asm b/lib/amd64/fdef_mulx.asm deleted file mode 100644 index 0165923..0000000 --- a/lib/amd64/fdef_mulx.asm +++ /dev/null @@ -1,1680 +0,0 @@ -; -; fdef_asm.asm Assembler code for large integer arithmetic in the default data format -; -; Copyright (c) Microsoft Corporation. Licensed under the MIT license. - -include ksamd64.inc - -include symcrypt_version.inc -include symcrypt_magic.inc - - - -include C_asm_shared.inc - -; A digit consists of 8 words of 64 bits each - -;The MULX/ADCX/ADOX instructions greatly speed up multi-precision arithmetic. -;A set of MULX + ADCX + ADOX can implement a single 64x64->128 plus two 64-bit additions in a single clock cycle (throughput) -;However, that speed puts pressure on other parts of the system. -; -;The code size for these three instructions is 18 cycles, whereas the pre-decoder on Broadwell reportedly can only -; load 16 bytes per cycle. -;That means the pre-decoder need 9 cycles per 8 multiplications, plus one for the per-row-of-8 overhead, meaning we need 10 -;cycles for 8 multiplications. Except that I have measured the 18 bytes as taking 1 cycle each, so the decoder must have a -; higher bandwidth. -; -;If we keep the code size small enough to fit in the uop cache, then the pre-decoder bottleneck goes away which should save us -;8 cycles per 512x512 multiplication. -; -;Code size for 512x512 is 64 multiplications at 18 bytes each = 36 cache lines of 32 bytes which need 72 uop cache lines that -;each contain up to 6 uops. (Each 32-byte code cache line contains 7 or so uops, so the 6 uops per uop cache line isn't enough.) -;The total uop cache is 256 lines, so we could fit 3+ copies of the 512x512 code. -; -;But we need the following: -;- A core 512x512 multiplication in a loop -;- Either zero the 8 carry registers up front (3 cycles), or have a separate 512x512 multiplication that sets up the carry registers. -; This latter is less code, alleviating the decoder bottleneck a bit. 
-;- A 512x512 multiplication that computes the Montgovery multipliers in-line -;- Code for squaring using MULX/ADX. -; -;The 512x512 unrolling is really necessary to get the performance; using 256x256 adds more overhead that we could gain back from the -;uop cache, and it uses more computations and will in general be slower. -; -;The full modexp loop also contains things like masked copies, ScsTable, etc. -;All in all, I don't see how we can keep all this inside the uop cache. -;Therefore, we will ignore the uop cache and optimize the code without it. -; -;Basic bottlenecks: -;- Pre-decoder at 16 bytes/cycle (turns out to be more...) -;- Decoder which can decode 1-1-1-1, 2-1-1, 3-1 (although some sources claim it doesn't) and 4 per cycle -;- One source claims that mulx takes 2 uops, and mulx with memory argument 3 uops which would limit the decoder throughput to -; require 2 cycles per mulx(mem)/adox/adcx triplet. -; -; We have verified experimentally that on Broadwell, a sequence of 1024 triples of (MULX w/ memory operand, adox, adcx) runs -; at 1 cycle per triple. As this code is too large for the uop cache, the pre-decoders and decoders are fast enough. -; Adding a fourth instruction to the tuple makes it run at 2 cycles/tuple. -; This is consistent with: -; - Pre-decoder is able to process at least 18 bytes per cycle -; - Mulx is 1 uop, Mulx + memory read is 2 uops -; - Decoder can produce 4 uops per cycle. -; -;Basic multiplication operation: -; -; We have one set of macros that do 8 words times 1 word, leaving 8 words carry in registers -; 8 of these 8x1 multiplications in sequence forms an 8x8, which is the inner loop body -; (First iteration is slightly differently and done first outside the loop) -; The inner loop iterates this to get an 8n * 8 multiplication -; The outer loop iterates this to get an 8n * 8m multiplication -; -; Our bottleneck seems to be the pre-decoder which can only run 16 bytes of code each clock cycle. -; (The uop cache is too small to hold our square+multiply+montgomery reduction code.) -; Thus we don't use zero-output and then multiply-and-add, but rather have separate copies -; of the code for the first iteration to do multiply-without-add as that cuts down on the total amount of code -; we need, and with that reduces the pre-decoder usage. 
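For reference, compilers expose the same instruction triple through intrinsics. A hedged sketch of one step of the MULADD18 pattern defined below, compiled with BMI2 and ADX enabled (e.g. -mbmi2 -madx on GCC/Clang); whether the compiler actually emits adcx/adox for the two independent chains is up to its ADX support, so treat this as a model of the data flow rather than a guaranteed instruction mapping:

    #include <stdint.h>
    #include <immintrin.h>      /* _mulx_u64 (BMI2), _addcarry_u64 */

    /* One word of A times B[0]: the low half feeds the CF-style chain into
       the low accumulator, the high half feeds the second (adox-style) chain. */
    static void muladd_step( unsigned long long aWord, unsigned long long b0,
                             unsigned long long *loAcc, unsigned long long *hiAcc,
                             unsigned char *cf, unsigned char *of )
    {
        unsigned long long hi;
        unsigned long long lo = _mulx_u64( aWord, b0, &hi );    /* 64x64 -> 128, flags untouched */

        *cf = _addcarry_u64( *cf, *loAcc, lo, loAcc );
        *of = _addcarry_u64( *of, *hiAcc, hi, hiAcc );
    }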
-; - -MULADD18 MACRO R0, R1, R2, R3, R4, R5, R6, R7, pD, pA, pB, T0, T1 - ; R0:R[7:1]:D[0] = A[7:0] * B[0] + D[0] + R[7:0] - ; Pre: Cy = Ov = 0 - ; Post: Cy = Ov = 0 - - mov rdx, [pB] - adox R0, [pD] - - mulx T1, T0, [pA + 0 * 8] - adcx R0, T0 - adox R1, T1 - - mulx T1, T0, [pA + 1 * 8] - adcx R1, T0 - adox R2, T1 - - mulx T1, T0, [pA + 2 * 8] - adcx R2, T0 - adox R3, T1 - - mulx T1, T0, [pA + 3 * 8] - adcx R3, T0 - adox R4, T1 - - mulx T1, T0, [pA + 4 * 8] - adcx R4, T0 - adox R5, T1 - - mulx T1, T0, [pA + 5 * 8] - adcx R5, T0 - adox R6, T1 - - mulx T1, T0, [pA + 6 * 8] - adcx R6, T0 - adox R7, T1 - - mulx T1, T0, [pA + 7 * 8] - adcx R7, T0 - mov [pD], R0 - - mov R0, 0 - adcx R0, R0 - adox R0, T1 - - ENDM ; MULADD18 - - -MULADD88 MACRO R0, R1, R2, R3, R4, R5, R6, R7, pD, pA, pB, T0, T1 - ; pre & post: Cy = Ov = 0 - ; R[7-0]:D[7-0] = A[7:0] * B[7:0] + R[7:0] + D[7:0] - ; rdx is volatile - - MULADD18 R0, R1, R2, R3, R4, R5, R6, R7, pD , pA, pB , T0, T1 - MULADD18 R1, R2, R3, R4, R5, R6, R7, R0, pD + 8, pA, pB + 8, T0, T1 - MULADD18 R2, R3, R4, R5, R6, R7, R0, R1, pD + 16, pA, pB + 16, T0, T1 - MULADD18 R3, R4, R5, R6, R7, R0, R1, R2, pD + 24, pA, pB + 24, T0, T1 - MULADD18 R4, R5, R6, R7, R0, R1, R2, R3, pD + 32, pA, pB + 32, T0, T1 - MULADD18 R5, R6, R7, R0, R1, R2, R3, R4, pD + 40, pA, pB + 40, T0, T1 - MULADD18 R6, R7, R0, R1, R2, R3, R4, R5, pD + 48, pA, pB + 48, T0, T1 - MULADD18 R7, R0, R1, R2, R3, R4, R5, R6, pD + 56, pA, pB + 56, T0, T1 - - ENDM ;MULADD88 - -HALF_SQUARE_NODIAG8 MACRO R0, R1, R2, R3, R4, R5, R6, R7, pD, pA, T0, T1 - ; pre & post: Cy = Ov = 0 - ; R[7-0]:D[7-0] = D[7:0] + (A[0:7]^2 - \sum_{i=0}^7 (A[i] * 2^{64*i}) )/2 - ; This is the component of the square that needs to be doubled, and then the diagonals added - ; rdx is volatile - - ; Note that Dst[0] is not changed by this macro - - mov rdx, [pA + 0 * 8] ; rdx = A0 - mov R1, [pD + 1 * 8] - mov R2, [pD + 2 * 8] - mov R3, [pD + 3 * 8] - mov R4, [pD + 4 * 8] - mov R5, [pD + 5 * 8] - mov R6, [pD + 6 * 8] - mov R7, [pD + 7 * 8] - xor R0, R0 - - mulx T1, T0, [pA + 1 * 8] - adcx R1, T0 - adox R2, T1 - - mulx T1, T0, [pA + 2 * 8] - adcx R2, T0 - adox R3, T1 - - mulx T1, T0, [pA + 3 * 8] - adcx R3, T0 - adox R4, T1 - - mulx T1, T0, [pA + 4 * 8] - adcx R4, T0 - adox R5, T1 - - mulx T1, T0, [pA + 5 * 8] - adcx R5, T0 - adox R6, T1 - - mulx T1, T0, [pA + 6 * 8] - adcx R6, T0 - adox R7, T1 - - mulx T1, T0, [pA + 7 * 8] - adcx R7, T0 - mov [pD + 1 * 8], R1 - - adcx R0, R0 - adox R0, T1 - mov [pD + 2 * 8], R2 - mov rdx, [pA + 1 * 8] ; rdx = A1 - - ;======= - - mulx T1, T0, [pA + 2 * 8] - adcx R3, T0 - adox R4, T1 - - mulx T1, T0, [pA + 3 * 8] - adcx R4, T0 - adox R5, T1 - - mulx T1, T0, [pA + 4 * 8] - adcx R5, T0 - adox R6, T1 - - mulx T1, T0, [pA + 5 * 8] - adcx R6, T0 - adox R7, T1 - - mulx T1, T0, [pA + 6 * 8] - adcx R7, T0 - adox R0, T1 - - mov rdx, [pA + 7 * 8] ; rdx = A7 - mov R1, 0 - mov R2, 0 - mov [pD + 3 * 8], R3 - - mulx T1, T0, [pA + 1 * 8] - adcx R0, T0 - adox R1, T1 ; doesn't produce Ov as T1 <= 0xff..fe and R1=0 - - mulx T1, T0, [pA + 2 * 8] - adcx R1, T0 - mov [pD + 4 * 8], R4 - - adcx R2, T1 - mov rdx, [pA + 2 * 8] ;rdx = A2 - - ;====== - - mulx T1, T0, [pA + 3 * 8] - adcx R5, T0 - adox R6, T1 - - mulx T1, T0, [pA + 4 * 8] - adcx R6, T0 - adox R7, T1 - - mulx T1, T0, [pA + 5 * 8] - adcx R7, T0 - adox R0, T1 - - mulx T1, T0, [pA + 6 * 8] - adcx R0, T0 - adox R1, T1 - - mov rdx, [pA + 4 * 8] ; rdx = A4 - mov R3, 0 - mov R4, 0 - - mulx T1, T0, [pA + 5 * 8] - adcx R1, T0 - adox R2, T1 - - mulx T1,T0, [pA + 6 * 8] - 
adcx R2, T0 - adox R3, T1 ; doesn't produce Ov as T1 <= 0xff..fe and R3=0 - - mov rdx, [pA + 5 * 8] ;rdx = A5 - mov [pD + 5 * 8], R5 - - mulx T1, T0, [pA + 6 * 8] - adcx R3, T0 - adcx R4, T1 - - mov rdx, [pA + 3 * 8] ;rdx = A3 - mov [pD + 6 * 8], R6 - - ;====== - - mulx T1, T0, [pA + 4 * 8] - adcx R7, T0 - adox R0, T1 - - mulx T1, T0, [pA + 5 * 8] - adcx R0, T0 - adox R1, T1 - - mulx T1, T0, [pA + 6 * 8] - adcx R1, T0 - adox R2, T1 - - mulx T1, T0, [pA + 7 * 8] - adcx R2, T0 - adox R3, T1 - - mov rdx, [pA + 7 * 8] ;rdx = A7 - mov R5, 0 - mov R6, 0 - mov [pD + 7 * 8], R7 - - mulx T1, T0, [pA + 4 * 8] - adcx R3, T0 - adox R4, T1 - - mulx T1, T0, [pA + 5 * 8] - adcx R4, T0 - adox R5, T1 ; doesn't produce Ov as T1 <= 0xff..fe and R5=0 - - mulx T1, T0, [pA + 6 * 8] - adcx R5, T0 - adcx R6, T1 - - xor R7, R7 - - ENDM - -MONTGOMERY18 MACRO R0, R1, R2, R3, R4, R5, R6, R7, modInv, pMod, pMont, T0, T1 - ; Mont[0] = (modinv * R0 mod 2^64) - ; R0:R[7:1]: = Mont[0] * Mod[7:0] + R[7:0] - ; Pre: - - ; Post: - - mov rdx, R0 - imul rdx, modInv - - mov [pMont], rdx - - xor T0, T0 ; Reset Cy = Ov = 0 - - mulx T1, T0, [pMod + 0 * 8] - adcx R0, T0 ; R0 = 0 here, but it produces a carry unless R0=0 at the start - adox R1, T1 - - mulx T1, T0, [pMod + 1 * 8] - adcx R1, T0 - adox R2, T1 - - mulx T1, T0, [pMod + 2 * 8] - adcx R2, T0 - adox R3, T1 - - mulx T1, T0, [pMod + 3 * 8] - adcx R3, T0 - adox R4, T1 - - mulx T1, T0, [pMod + 4 * 8] - adcx R4, T0 - adox R5, T1 - - mulx T1, T0, [pMod + 5 * 8] - adcx R5, T0 - adox R6, T1 - - mulx T1, T0, [pMod + 6 * 8] - adcx R6, T0 - adox R7, T1 - - mulx T1, T0, [pMod + 7 * 8] - adcx R7, T0 - - ; R0 = 0 here due to our modinv invariant... - - adcx R0, R0 - adox R0, T1 - - ENDM - -ZEROREG MACRO R - xor R,R - ENDM - -ZEROREG_8 MACRO R0, R1, R2, R3, R4, R5, R6, R7 - ZEROREG R0 - ZEROREG R1 - ZEROREG R2 - ZEROREG R3 - ZEROREG R4 - ZEROREG R5 - ZEROREG R6 - ZEROREG R7 - ENDM - -;VOID -;SYMCRYPT_CALL -;SymCryptFdefRawMul( -; _In_reads_(nWords1) PCUINT32 pSrc1, -; UINT32 nDigits1, -; _In_reads_(nWords2) PCUINT32 pSrc2, -; UINT32 nDigits2, -; _Out_writes_(nWords1 + nWords2) PUINT32 pDst ) - -SymCryptFdefRawMulMulx_Frame struct - SavedRbp dq ? - SavedRbx dq ? - SavedRsi dq ? - SavedRdi dq ? - SavedR15 dq ? - SavedR14 dq ? - SavedR13 dq ? - SavedR12 dq ? - returnaddress dq ? - pSrc1Home dq ? - nDigits1Home dq ? - pSrc2Home dq ? - nDigits2Home dq ? - pDst dq ? 
- -SymCryptFdefRawMulMulx_Frame ends - - NESTED_ENTRY SymCryptFdefRawMulMulx, _TEXT - - ; We need all the registers - push_reg r12 - push_reg r13 - push_reg r14 - push_reg r15 - push_reg rdi - push_reg rsi - push_reg rbx - push_reg rbp - - END_PROLOGUE - - mov [rsp + SymCryptFdefRawMulMulx_Frame.pSrc1Home], rcx - mov [rsp + SymCryptFdefRawMulMulx_Frame.nDigits1Home], rdx - mov [rsp + SymCryptFdefRawMulMulx_Frame.pSrc2Home], r8 - mov [rsp + SymCryptFdefRawMulMulx_Frame.nDigits2Home], r9 - - ; rcx = pSrc1 - ; rdx = nDigits1 - ; r8 = pSrc2 - ; r9 = nDigits2 - ; pDst on stack - - ; pSrc1/Digits1 = outer loop - ; pSrc2/Digits2 = inner loop - - ; First we wipe nDigits2 of the result (size of in) - mov rbx,[rsp + SymCryptFdefRawMulMulx_Frame.pDst] - mov rdi, rbx - - ; Wipe destination for nDigit2 blocks - xorps xmm0,xmm0 ; Zero register for 16-byte wipes - mov rax, r9 - -SymCryptFdefRawMulMulxWipeLoop: - movaps [rbx],xmm0 - movaps [rbx+16],xmm0 ; Wipe 32 bytes - movaps [rbx+32],xmm0 ; Wipe 32 bytes - movaps [rbx+48],xmm0 ; Wipe 32 bytes - add rbx, 64 - sub rax, 1 - jnz SymCryptFdefRawMulMulxWipeLoop - - -SymCryptFdefRawMulxOuterLoop: - - ZEROREG_8 rsi, rbp, r10, r11, r12, r13, r14, r15 ; Leaves Cy = Ov = 0 - -SymCryptFdefRawMulMulxInnerLoop: - - ; Register allocation in loops: - ; rsi, rbp, r10, r11, r12, r13, r14, r15 8-word carry - ; rax, rbx temps for multiplication - ; rcx, r8 pSrc1, pSrc2 running pointers - ; r9 inner loop counter - ; rdx fixed input reg for multiplication - ; rdi Destination running pointer inner loop - ; rsp[pDst] Destination running pointer outer loop - ; rsp[nDigits1] outer loop counter - - MULADD88 rsi, rbp, r10, r11, r12, r13, r14, r15, rdi, rcx, r8, rax, rbx - - add r8, 64 ; Src2 ptr - add rdi, 64 - - sub r9d, 1 ; sets Cy = Ov = 0 because r9 < 2^32 / 64 - jnz SymCryptFdefRawMulMulxInnerLoop - - ; Write the 8-word carry-out to the destination - mov [rdi + 0*8], rsi - mov [rdi + 1*8], rbp - mov [rdi + 2*8], r10 - mov [rdi + 3*8], r11 - mov [rdi + 4*8], r12 - mov [rdi + 5*8], r13 - mov [rdi + 6*8], r14 - mov [rdi + 7*8], r15 - - ; set up for next iteration - ; reset rdi & increment - mov rdi, [rsp + SymCryptFdefRawMulMulx_Frame.pDst] - add rdi, 64 - mov [rsp + SymCryptFdefRawMulMulx_Frame.pDst], rdi - - ; reload pSrc2/nDigits2 - mov r9, [rsp + SymCryptFdefRawMulMulx_Frame.nDigits2Home] - mov r8, [rsp + SymCryptFdefRawMulMulx_Frame.pSrc2Home] - - ; update PSrc1 - add rcx, 64 - - ; nDigits1 loop counter - mov rax, [rsp + SymCryptFdefRawMulMulx_Frame.nDigits1Home] - sub rax, 1 ; leaves Cy = Ov = 0 because nDigits1 < 2^32 / 64 - mov [rsp + SymCryptFdefRawMulMulx_Frame.nDigits1Home], rax - - jnz SymCryptFdefRawMulxOuterLoop - - BEGIN_EPILOGUE - - pop rbp - pop rbx - pop rsi - pop rdi - pop r15 - pop r14 - pop r13 - pop r12 - ret - - NESTED_END SymCryptFdefRawMulMulx, _TEXT - -; VOID -; SYMCRYPT_CALL -; SymCryptFdefRawSquare( -; _In_reads_(nDgigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc, -; UINT32 nDigits, -; _Out_writes_(2*nWords) PUINT32 pDst ) - -SymCryptFdefRawSquareMulx_Frame struct - - SavedRbp dq ? - SavedRbx dq ? - SavedRsi dq ? - SavedRdi dq ? - SavedR15 dq ? - SavedR14 dq ? - SavedR13 dq ? - SavedR12 dq ? - returnaddress dq ? - pSrcHome dq ? - - ; Two 32-bit local variables, in the space of one normal 64-bit stack slot - nDigitsHome dd ? ; 32 bits, original argument to function - nextNDigits dd ? ; 32 bits; number of digits to do in the next sequence of inner loops. - - pDstHome dq ? - pDstPtr dq ? 
; pDst running pointer outer loop (This is the 4th argument stack slot which is always available.) - -SymCryptFdefRawSquareMulx_Frame ends - - NESTED_ENTRY SymCryptFdefRawSquareMulx, _TEXT - - ; We need all the registers - push_reg r12 - push_reg r13 - push_reg r14 - push_reg r15 - push_reg rdi - push_reg rsi - push_reg rbx - push_reg rbp - - END_PROLOGUE - - ; rcx = pSrc - ; rdx = nDigits - ; r8 = pDst - - ; Save parameters for phase 2 - mov [rsp + SymCryptFdefRawSquareMulx_Frame.pSrcHome], rcx - mov [rsp + SymCryptFdefRawSquareMulx_Frame.nDigitsHome], edx - mov [rsp + SymCryptFdefRawSquareMulx_Frame.pDstHome], r8 - - ; Initialize our local variables - mov [rsp + SymCryptFdefRawSquareMulx_Frame.nextNDigits], edx - mov [rsp + SymCryptFdefRawSquareMulx_Frame.pDstPtr], r8 - - mov r9d, edx ; rdx is used in the multiplications... - - ; Wipe destination for nDigits blocks - - xor rax, rax - mov rbx, r8 - ; we'll use the edx digit counter destructively... - -SymCryptFdefRawSquareMulxWipeLoop: - ; we use 8-byte writes as we will be reading this very soon in 8-byte chunks, and this way the store-load - ; forwarding works - mov [rbx ], rax - mov [rbx + 8], rax - mov [rbx + 16], rax - mov [rbx + 24], rax - mov [rbx + 32], rax - mov [rbx + 40], rax - mov [rbx + 48], rax - mov [rbx + 56], rax - add rbx, 64 - sub edx, 1 - jnz SymCryptFdefRawSquareMulxWipeLoop - - ; Cy = Ov = 0 here because the last 'sub edx,1' yielded 0 - -SymCryptFdefRawSquareMulxOuterLoop: - - HALF_SQUARE_NODIAG8 rsi, rbp, r10, r11, r12, r13, r14, r15, r8, rcx, rax, rbx - - sub r9d, 1 - jz SymCryptFdefRawSquareMulxPhase2 ; end of phase 1 - - lea rdi, [rcx + 64] - lea r8, [r8 + 64] - -SymCryptFdefRawSquareMulxInnerLoop: - ; rsi, rbp, r10, r11, r12, r13, r14, r15 8-word carry - ; rax, rbx temps for multiplication - ; rcx pSrc running pointer outer loop - ; r8 pDst running pointer inner loop - ; r9d inner loop nDigit counter - ; rdx fixed input reg for multiplication - ; rdi pSrc running pointer inner loop - - ; rsp[pSrc] pSrc (used for final pass) - ; rsp[nDigits] nDigits (used for final pass) - ; rsp[pDst] pDst (used for final pass) - ; rsp[nextNDigits] # inner loop blocks in next outer loop iteration - ; rsp[pDstPtr] pDst running pointer outer loop - - MULADD88 rsi, rbp, r10, r11, r12, r13, r14, r15, r8, rcx, rdi, rax, rbx - - add r8, 64 - add rdi, 64 - - sub r9d, 1 ; Sets Cy = Ov = 0 because r9d < 2^32 / bits_per_digit - jnz SymCryptFdefRawSquareMulxInnerLoop - - ; Write the 8-word carry-out to the destination - mov [r8 + 0*8], rsi - mov [r8 + 1*8], rbp - mov [r8 + 2*8], r10 - mov [r8 + 3*8], r11 - mov [r8 + 4*8], r12 - mov [r8 + 5*8], r13 - mov [r8 + 6*8], r14 - mov [r8 + 7*8], r15 - - add rcx, 64 - - mov r8, [rsp + SymCryptFdefRawSquareMulx_Frame.pDstPtr] - add r8, 128 ; Shift output ptr by 2 digits - mov [rsp + SymCryptFdefRawSquareMulx_Frame.pDstPtr], r8 - - mov r9d, [rsp + SymCryptFdefRawSquareMulx_Frame.nextNDigits] - sub r9d, 1 - mov [rsp + SymCryptFdefRawSquareMulx_Frame.nextNDigits], r9d - - jmp SymCryptFdefRawSquareMulxOuterLoop - - -SymCryptFdefRawSquareMulxPhase2: - ; Cy = Ov = 0 because last 'sub r9d, 1' resulted in 0 - - ; Write the 8-word carry-out to the destination - mov [r8 + 8*8], rsi - mov [r8 + 9*8], rbp - mov [r8 + 10*8], r10 - mov [r8 + 11*8], r11 - mov [r8 + 12*8], r12 - mov [r8 + 13*8], r13 - mov [r8 + 14*8], r14 - mov [r8 + 15*8], r15 - - ; Compute diagonals, and add double the result so far - - mov rcx, [rsp + SymCryptFdefRawSquareMulx_Frame.pSrcHome] - mov r9d, [rsp + 
SymCryptFdefRawSquareMulx_Frame.nDigitsHome] - mov r8, [rsp + SymCryptFdefRawSquareMulx_Frame.pDstHome] - - ; We can't keep the carries in Cy and Ov because there is no way to do a loop counter - ; without touching the Ov flag. - ; So we set the Ov carry in rsi, and retain a zero in rdi - xor esi, esi - xor edi, edi - -SymCryptFdefRawSquareMulxDiagonalsLoop: - ; Cy = carry in - ; esi = carry in (1 bit) - ; Ov = 0 - -SYMCRYPT_SQUARE_DIAG MACRO index - mov rdx, [rcx + 8 * index] - mov r10, [r8 + 16 * index] - mov r11, [r8 + 16 * index + 8] - mulx rbx, rax, rdx - adcx rax, r10 - adox rax, r10 - adcx rbx, r11 - adox rbx, r11 - mov [r8 + 16 * index], rax - mov [r8 + 16 * index + 8], rbx - ENDM - - ; First word is different to handle the carry - ; SYMCRYPT_SQUARE_DIAG 0 - mov rdx, [rcx] - mov r10, [r8] - mov r11, [r8 + 8] - mulx rbx, rax, rdx - adcx rax, rsi ; add both carries - adcx rbx, rdi ; rdi = 0; now Cy = 0 because result of multiply <= ff..fe00..01 - - adcx rax, r10 - adox rax, r10 - adcx rbx, r11 - adox rbx, r11 - mov [r8 ], rax - mov [r8 + 8], rbx - - SYMCRYPT_SQUARE_DIAG 1 - SYMCRYPT_SQUARE_DIAG 2 - SYMCRYPT_SQUARE_DIAG 3 - SYMCRYPT_SQUARE_DIAG 4 - SYMCRYPT_SQUARE_DIAG 5 - SYMCRYPT_SQUARE_DIAG 6 - SYMCRYPT_SQUARE_DIAG 7 - - ; Move the Ov flag into esi - mov esi, edi - adox esi, edi - - ; There is no way to do a loop counter without overwriting the Ov flag - ; Even the 'dec' instruction touches it, and LAHF/SAHF doesn't load/store the Ov flag. - ; We can't push/pop efl in a function body - - lea rcx, [rcx + 64] - lea r8, [r8 + 128] - dec r9d - jnz SymCryptFdefRawSquareMulxDiagonalsLoop - - - BEGIN_EPILOGUE - - pop rbp - pop rbx - pop rsi - pop rdi - pop r15 - pop r14 - pop r13 - pop r12 - ret - - NESTED_END SymCryptFdefRawSquareMulx, _TEXT - - - - - -;VOID -;SymCryptFdefMontgomeryReduce( -; _In_ PCSYMCRYPT_MODULUS pmMod, -; _In_ PUINT32 pSrc, -; _Out_ PUINT32 pDst ) - -SymCryptFdefMontgomeryReduceMulx_Frame struct - - SavedRbp dq ? - SavedRbx dq ? - SavedRsi dq ? - SavedRdi dq ? - SavedR15 dq ? - SavedR14 dq ? - SavedR13 dq ? - SavedR12 dq ? - returnaddress dq ? - - pmModHome dq ? - pSrcHome dq ? - pDstHome dq ? - - ; two 4-byte variables in P4Home - CntOuter dd ? ; outer loop counter - HighCarry dd ? 
- -SymCryptFdefMontgomeryReduceMulx_Frame ends - - - NESTED_ENTRY SymCryptFdefMontgomeryReduceMulx, _TEXT - - ; We need all the registers - push_reg r12 - push_reg r13 - push_reg r14 - push_reg r15 - push_reg rdi - push_reg rsi - push_reg rbx - push_reg rbp - - END_PROLOGUE - - ; rcx = pmMod - ; rdx = pSrc = scratch buffer - ; r8 = pDst - - mov [rsp + SymCryptFdefMontgomeryReduceMulx_Frame.pmModHome], rcx - mov [rsp + SymCryptFdefMontgomeryReduceMulx_Frame.pSrcHome], rdx - mov [rsp + SymCryptFdefMontgomeryReduceMulx_Frame.pDstHome], r8 - - mov r8, rdx - - mov eax, [rcx + SymCryptModulusNdigitsOffsetAmd64] - mov [rsp + SymCryptFdefMontgomeryReduceMulx_Frame.CntOuter], eax - ; CntOuter = nDigits - - xor ebx, ebx - mov [rsp + SymCryptFdefMontgomeryReduceMulx_Frame.HighCarry], ebx - ; HighCarry = 0 - -SymCryptFdefMontgomeryReduceMulxOuterLoop: - ; rcx = pmMod - ; r8 = pSrc = tmp buffer that we will reduce - mov rsi, [r8 + 0 * 8] - mov rbp, [r8 + 1 * 8] - mov r10, [r8 + 2 * 8] - mov r11, [r8 + 3 * 8] - mov r12, [r8 + 4 * 8] - mov r13, [r8 + 5 * 8] - mov r14, [r8 + 6 * 8] - mov r15, [r8 + 7 * 8] - - mov rdi, [rcx + SymCryptModulusMontgomeryInv64OffsetAmd64] ; inv64 - mov r9d, [rcx + SymCryptModulusNdigitsOffsetAmd64] - lea rcx, [rcx + SymCryptModulusValueOffsetAmd64] ; modulus value - - ; r8 = value to reduce - ; rsi - r15= r8[0..7] - ; rcx = modulus value - ; rdi = modinv - - MONTGOMERY18 rsi, rbp, r10, r11, r12, r13, r14, r15, rdi, rcx, r8 + 0 * 8, rax, rbx - MONTGOMERY18 rbp, r10, r11, r12, r13, r14, r15, rsi, rdi, rcx, r8 + 1 * 8, rax, rbx - MONTGOMERY18 r10, r11, r12, r13, r14, r15, rsi, rbp, rdi, rcx, r8 + 2 * 8, rax, rbx - MONTGOMERY18 r11, r12, r13, r14, r15, rsi, rbp, r10, rdi, rcx, r8 + 3 * 8, rax, rbx - MONTGOMERY18 r12, r13, r14, r15, rsi, rbp, r10, r11, rdi, rcx, r8 + 4 * 8, rax, rbx - MONTGOMERY18 r13, r14, r15, rsi, rbp, r10, r11, r12, rdi, rcx, r8 + 5 * 8, rax, rbx - MONTGOMERY18 r14, r15, rsi, rbp, r10, r11, r12, r13, rdi, rcx, r8 + 6 * 8, rax, rbx - MONTGOMERY18 r15, rsi, rbp, r10, r11, r12, r13, r14, rdi, rcx, r8 + 7 * 8, rax, rbx - - ; rsi .. r15 = carry from multiply-add - ; r8[0..7] = Montgomery factors - - mov rdi, r8 ; factor to multiply by - add rcx, 64 - add r8, 64 - - sub r9d, 1 - jz SymCryptFdefMontgomeryReduceMulxInnerLoopDone - -SymCryptFdefMontgomeryReduceMulxInnerLoop: - - ; rsi, rbp, r10, r11, r12, r13, r14, r15 8-word carry - ; rax, rbx temps for multiplication - ; rcx running pointer pMod inner loop - ; r8 running pointer pSrc inner loop - ; rdi Montgomery factors for this row - ; r9 loop ctr - ; rdx fixed input reg for multiplication - - MULADD88 rsi, rbp, r10, r11, r12, r13, r14, r15, r8, rcx, rdi, rax, rbx - ; pre & post: Cy = Ov = 0 - ; R[7-0]:D[7-0] = A[7:0] * B[7:0] + R[7:0] + D[7:0] - ; rdx is volatile - - add rcx, 64 - add r8, 64 - sub r9d, 1 - jnz SymCryptFdefMontgomeryReduceMulxInnerLoop - - -SymCryptFdefMontgomeryReduceMulxInnerLoopDone: - - ; We have an 8-word carry here, which we need to add to the in-memory buffer and retain a carry - ; We also saved a 1-bit carry from the previous outer loop - xor edx, edx - mov eax, [rsp + SymCryptFdefMontgomeryReduceMulx_Frame.HighCarry] - ; move carry into Cy flag - neg eax - - ; We do this in separate instructions to help the instruction decoder build up a lead... 
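In C terms, the adc ladder that follows computes the following (hedged model; fold_carry8 is an illustrative name, not a SymCrypt function):

    #include <stdint.h>

    /* buf[0..7] += reg[0..7] + carryIn, where carryIn is the saved 1-bit
       HighCarry; returns the new 1-bit carry to save for the next row. */
    static unsigned fold_carry8( uint64_t buf[8], const uint64_t reg[8], unsigned carryIn )
    {
        unsigned __int128 c = carryIn;

        for( int i = 0; i < 8; i++ )
        {
            c += (unsigned __int128) buf[i] + reg[i];
            buf[i] = (uint64_t) c;
            c >>= 64;
        }
        return (unsigned) c;    /* 0 or 1 */
    }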
- mov rax, [r8 + 0 * 8] - adc rax, rsi - mov [r8 + 0 * 8], rax - - mov rbx, [r8 + 1 * 8] - adc rbx, rbp - mov [r8 + 1 * 8], rbx - - mov rax, [r8 + 2 * 8] - adc rax, r10 - mov [r8 + 2 * 8], rax - - mov rbx, [r8 + 3 * 8] - adc rbx, r11 - mov [r8 + 3 * 8], rbx - - mov rax, [r8 + 4 * 8] - adc rax, r12 - mov [r8 + 4 * 8], rax - - mov rbx, [r8 + 5 * 8] - adc rbx, r13 - mov [r8 + 5 * 8], rbx - - mov rax, [r8 + 6 * 8] - adc rax, r14 - mov [r8 + 6 * 8], rax - - mov rbx, [r8 + 7 * 8] - adc rbx, r15 - mov [r8 + 7 * 8], rbx - - adc edx, edx ; edx = carry - mov [rsp + SymCryptFdefMontgomeryReduceMulx_Frame.HighCarry], edx - - mov r8, [rsp + SymCryptFdefMontgomeryReduceMulx_Frame.pSrcHome] - add r8, 64 - mov [rsp + SymCryptFdefMontgomeryReduceMulx_Frame.pSrcHome], r8 - - mov rcx, [rsp + SymCryptFdefMontgomeryReduceMulx_Frame.pmModHome] - - mov eax, [rsp + SymCryptFdefMontgomeryReduceMulx_Frame.CntOuter] - sub eax, 1 - mov [rsp + SymCryptFdefMontgomeryReduceMulx_Frame.CntOuter], eax - - jnz SymCryptFdefMontgomeryReduceMulxOuterloop - - ; edx = output carry - - mov esi, [rcx + SymCryptModulusNdigitsOffsetAmd64] - lea rcx, [rcx + SymCryptModulusValueOffsetAmd64] ; modulus value - - mov rdi, [rsp + SymCryptFdefMontgomeryReduceMulx_Frame.pDstHome] - - ; r8 = result buffer pointer - ; esi = # digits - ; rcx = modulus value - ; rdi = Dst - - ; copy these values for the maked copy loop - mov r9d, esi ; nDigits - mov r10, r8 ; result buffer - mov rbp, rdi ; destination pointer - - ; pDst = Reduction result - Modulus - -SymCryptFdefMontgomeryReduceMulxSubLoop: - mov rax,[r8 + 0 * 8] - sbb rax,[rcx + 0 * 8] - mov [rdi + 0 * 8], rax - - mov rbx,[r8 + 1 * 8] - sbb rbx,[rcx + 1 * 8] - mov [rdi + 1 * 8], rbx - - mov rax,[r8 + 2 * 8] - sbb rax,[rcx + 2 * 8] - mov [rdi + 2 * 8], rax - - mov rbx,[r8 + 3 * 8] - sbb rbx,[rcx + 3 * 8] - mov [rdi + 3 * 8], rbx - - mov rax,[r8 + 4 * 8] - sbb rax,[rcx + 4 * 8] - mov [rdi + 4 * 8], rax - - mov rbx,[r8 + 5 * 8] - sbb rbx,[rcx + 5 * 8] - mov [rdi + 5 * 8], rbx - - mov rax,[r8 + 6 * 8] - sbb rax,[rcx + 6 * 8] - mov [rdi + 6 * 8], rax - - mov rbx,[r8 + 7 * 8] - sbb rbx,[rcx + 7 * 8] - mov [rdi + 7 * 8], rbx - - lea r8, [r8 + 64] - lea rcx, [rcx + 64] - lea rdi, [rdi + 64] - dec esi - jnz SymCryptFdefMontgomeryReduceMulxSubLoop - - ; now a masked copy from the reduction buffer to the destination. 
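The SSE sequence below is a constant-time select: the same loads and stores execute whether or not the copy happens, so the final reduction step leaks nothing through the memory access pattern. A hedged C model (illustrative names only):

    #include <stddef.h>
    #include <stdint.h>

    /* Copy src over dst when mask is all-ones; keep dst when mask is zero. */
    static void masked_copy( uint64_t *dst, const uint64_t *src, size_t nWords, uint64_t mask )
    {
        for( size_t i = 0; i < nWords; i++ )
        {
            dst[i] = (src[i] & mask) | (dst[i] & ~mask);
        }
    }

    /* The 256-bit paths earlier in the patch use the equivalent xor/and/xor
       form: selects a when mask is all-ones, else b, without a branch. */
    static uint64_t ct_select( uint64_t mask, uint64_t a, uint64_t b )
    {
        return b ^ (mask & (a ^ b));
    }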
- ; copy if high carry = 0 and Cy = 1 - sbb edx, 0 - ; edx = copy mask, ff...ff if copy, 0 of no copy - - movd xmm0, edx ; xmm0[0] = mask - pcmpeqd xmm1, xmm1 ; xmm1 = ff...ff - pshufd xmm0, xmm0, 0 ; xmm0[0..3] = mask - pxor xmm1, xmm0 ; xmm1 = not Mask - -SymCryptFdefMontgomeryReduceMulxMaskedCopyLoop: - movdqa xmm2, [r10 + 0 * 16] ; xmm2 = pSrc[0] - movdqa xmm3, [rbp + 0 * 16] ; xmm3 = pDst[0] - pand xmm2, xmm0 - pand xmm3, xmm1 - por xmm2, xmm3 - movdqa [rbp + 0 * 16], xmm2 - - movdqa xmm2, [r10 + 1 * 16] ; xmm2 = pSrc[0] - movdqa xmm3, [rbp + 1 * 16] ; xmm3 = pDst[0] - pand xmm2, xmm0 - pand xmm3, xmm1 - por xmm2, xmm3 - movdqa [rbp + 1 * 16], xmm2 - - movdqa xmm2, [r10 + 2 * 16] ; xmm2 = pSrc[0] - movdqa xmm3, [rbp + 2 * 16] ; xmm3 = pDst[0] - pand xmm2, xmm0 - pand xmm3, xmm1 - por xmm2, xmm3 - movdqa [rbp + 2 * 16], xmm2 - - movdqa xmm2, [r10 + 3 * 16] ; xmm2 = pSrc[0] - movdqa xmm3, [rbp + 3 * 16] ; xmm3 = pDst[0] - pand xmm2, xmm0 - pand xmm3, xmm1 - por xmm2, xmm3 - movdqa [rbp + 3 * 16], xmm2 - - ; Move on to the next digit - - add r10, 64 - add rbp, 64 - sub r9d, 1 - jnz SymCryptFdefMontgomeryReduceMulxMaskedCopyLoop - - BEGIN_EPILOGUE - - pop rbp - pop rbx - pop rsi - pop rdi - pop r15 - pop r14 - pop r13 - pop r12 - ret - - NESTED_END SymCryptFdefMontgomeryReduceMulx, _TEXT - -; -------------------------------- -; 1024-bit size specific functions -; -------------------------------- - -;VOID -;SYMCRYPT_CALL -;SymCryptFdefRawMul( -; _In_reads_(nWords1) PCUINT32 pSrc1, -; _In_reads_(nWords2) PCUINT32 pSrc2, -; UINT32 nDigits, -; _Out_writes_(nWords1 + nWords2) PUINT32 pDst ) - - NESTED_ENTRY SymCryptFdefRawMulMulx1024, _TEXT - - ; We need all the registers - push_reg r12 - push_reg r13 - push_reg r14 - push_reg r15 - push_reg rdi - push_reg rsi - push_reg rbx - push_reg rbp - - END_PROLOGUE - - ; First we wipe nDigits2 of the result (size of in) - mov rbx, r9 - mov rdi, r9 - - mov r9, r8 - mov r8, rdx - - ; rcx = pSrc1 - ; r8 = pSrc2 - ; r9 = nDigits - - ; Wipe destination for nDigit2 blocks - xorps xmm0,xmm0 ; Zero register for 16-byte wipes - - movaps [rbx],xmm0 - movaps [rbx+16],xmm0 ; Wipe 32 bytes - movaps [rbx+32],xmm0 ; Wipe 32 bytes - movaps [rbx+48],xmm0 ; Wipe 32 bytes - - movaps [rbx+64],xmm0 - movaps [rbx+80],xmm0 ; Wipe 32 bytes - movaps [rbx+96],xmm0 ; Wipe 32 bytes - movaps [rbx+112],xmm0 ; Wipe 32 bytes - - ; Digit 1 from src2 - - ZEROREG_8 rsi, rbp, r10, r11, r12, r13, r14, r15 ; Leaves Cy = Ov = 0 - - MULADD88 rsi, rbp, r10, r11, r12, r13, r14, r15, rdi, rcx, r8, rax, rbx - - add r8, 64 ; Src2 ptr - add rdi, 64 - xor rax, rax ; sets Cy = Ov = 0 - - MULADD88 rsi, rbp, r10, r11, r12, r13, r14, r15, rdi, rcx, r8, rax, rbx - - add rdi, 64 - - ; Write the 8-word carry-out to the destination - mov [rdi + 0*8], rsi - mov [rdi + 1*8], rbp - mov [rdi + 2*8], r10 - mov [rdi + 3*8], r11 - mov [rdi + 4*8], r12 - mov [rdi + 5*8], r13 - mov [rdi + 6*8], r14 - mov [rdi + 7*8], r15 - - ; Digit 2 from src2 - - ; set up - - ; Mov rdi one digit back - sub rdi, 64 - - ; reload pSrc2 - sub r8, 64 - - ; update PSrc1 - add rcx, 64 - - ZEROREG_8 rsi, rbp, r10, r11, r12, r13, r14, r15 ; Leaves Cy = Ov = 0 - - MULADD88 rsi, rbp, r10, r11, r12, r13, r14, r15, rdi, rcx, r8, rax, rbx - - add r8, 64 ; Src2 ptr - add rdi, 64 - xor rax, rax ; sets Cy = Ov = 0 - - MULADD88 rsi, rbp, r10, r11, r12, r13, r14, r15, rdi, rcx, r8, rax, rbx - - add rdi, 64 - - ; Write the 8-word carry-out to the destination - mov [rdi + 0*8], rsi - mov [rdi + 1*8], rbp - mov [rdi + 2*8], r10 - mov [rdi + 3*8], r11 
- mov [rdi + 4*8], r12 - mov [rdi + 5*8], r13 - mov [rdi + 6*8], r14 - mov [rdi + 7*8], r15 - - BEGIN_EPILOGUE - - pop rbp - pop rbx - pop rsi - pop rdi - pop r15 - pop r14 - pop r13 - pop r12 - ret - - NESTED_END SymCryptFdefRawMulMulx1024, _TEXT - -; VOID -; SYMCRYPT_CALL -; SymCryptFdefRawSquare( -; _In_reads_(nDgigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc, -; UINT32 nDigits, -; _Out_writes_(2*nWords) PUINT32 pDst ) - - NESTED_ENTRY SymCryptFdefRawSquareMulx1024, _TEXT - - ; We need all the registers - push_reg r12 - push_reg r13 - push_reg r14 - push_reg r15 - push_reg rdi - push_reg rsi - push_reg rbx - push_reg rbp - - END_PROLOGUE - - ; rcx = pSrc - ; rdx = nDigits // (ignored) - ; r8 = pDst - - ; Save parameters for phase 2 - mov r9, r8 ; pDst - - ; Wipe destination for nDigits blocks - - xor rax, rax - mov rbx, r8 - ; we'll use the edx digit counter destructively... - - ; Wipe destination - xorps xmm0,xmm0 ; Zero register for 16-byte wipes - - movaps [rbx],xmm0 - movaps [rbx+16],xmm0 ; Wipe 32 bytes - movaps [rbx+32],xmm0 ; Wipe 32 bytes - movaps [rbx+48],xmm0 ; Wipe 32 bytes - - movaps [rbx+64],xmm0 - movaps [rbx+80],xmm0 ; Wipe 32 bytes - movaps [rbx+96],xmm0 ; Wipe 32 bytes - movaps [rbx+112],xmm0 ; Wipe 32 bytes - - ; Cy = Ov = 0 here - - HALF_SQUARE_NODIAG8 rsi, rbp, r10, r11, r12, r13, r14, r15, r8, rcx, rax, rbx - - lea rdi, [rcx + 64] - lea r8, [r8 + 64] - - ; rsi, rbp, r10, r11, r12, r13, r14, r15 8-word carry - ; rax, rbx temps for multiplication - ; rcx pSrc running pointer outer loop - ; r8 pDst running pointer inner loop - ; rdx fixed input reg for multiplication - ; rdi pSrc running pointer inner loop - - ; rsp[pSrc] pSrc (used for final pass) - ; rsp[nDigits] nDigits (used for final pass) - ; rsp[pDst] pDst (used for final pass) - ; rsp[nextNDigits] # inner loop blocks in next outer loop iteration - ; rsp[pDstPtr] pDst running pointer outer loop - - MULADD88 rsi, rbp, r10, r11, r12, r13, r14, r15, r8, rcx, rdi, rax, rbx - - add r8, 64 - add rdi, 64 - - ; Write the 8-word carry-out to the destination - mov [r8 + 0*8], rsi - mov [r8 + 1*8], rbp - mov [r8 + 2*8], r10 - mov [r8 + 3*8], r11 - mov [r8 + 4*8], r12 - mov [r8 + 5*8], r13 - mov [r8 + 6*8], r14 - mov [r8 + 7*8], r15 - - add rcx, 64 - - ; r8 which is the destination pointer is shifted here by 2 digits - - xor rax, rax ; Sets Cy = Ov = 0 - - HALF_SQUARE_NODIAG8 rsi, rbp, r10, r11, r12, r13, r14, r15, r8, rcx, rax, rbx - - ; Cy = Ov = 0 because last 'sub r9d, 1' resulted in 0 - - ; Write the 8-word carry-out to the destination - mov [r8 + 8*8], rsi - mov [r8 + 9*8], rbp - mov [r8 + 10*8], r10 - mov [r8 + 11*8], r11 - mov [r8 + 12*8], r12 - mov [r8 + 13*8], r13 - mov [r8 + 14*8], r14 - mov [r8 + 15*8], r15 - - ; Compute diagonals, and add double the result so far - - sub rdi, 128 ; Revert rdi back to pSrcHome - mov rcx, rdi - mov r8, r9 - - xor rax, rax ; Sets Cy = Ov = 0 - - SYMCRYPT_SQUARE_DIAG 0 - SYMCRYPT_SQUARE_DIAG 1 - SYMCRYPT_SQUARE_DIAG 2 - SYMCRYPT_SQUARE_DIAG 3 - SYMCRYPT_SQUARE_DIAG 4 - SYMCRYPT_SQUARE_DIAG 5 - SYMCRYPT_SQUARE_DIAG 6 - SYMCRYPT_SQUARE_DIAG 7 - - SYMCRYPT_SQUARE_DIAG 8 - SYMCRYPT_SQUARE_DIAG 9 - SYMCRYPT_SQUARE_DIAG 10 - SYMCRYPT_SQUARE_DIAG 11 - SYMCRYPT_SQUARE_DIAG 12 - SYMCRYPT_SQUARE_DIAG 13 - SYMCRYPT_SQUARE_DIAG 14 - SYMCRYPT_SQUARE_DIAG 15 - - BEGIN_EPILOGUE - - pop rbp - pop rbx - pop rsi - pop rdi - pop r15 - pop r14 - pop r13 - pop r12 - ret - - NESTED_END SymCryptFdefRawSquareMulx1024, _TEXT - -;VOID -;SymCryptFdefMontgomeryReduce( -; _In_ PCSYMCRYPT_MODULUS pmMod, -; 
_In_ PUINT32 pSrc, -; _Out_ PUINT32 pDst ) - - NESTED_ENTRY SymCryptFdefMontgomeryReduceMulx1024, _TEXT - - ; We need all the registers - push_reg r12 - push_reg r13 - push_reg r14 - push_reg r15 - push_reg rdi - push_reg rsi - push_reg rbx - push_reg rbp - - END_PROLOGUE - - ; rcx = pmMod - ; rdx = pSrc = scratch buffer - ; r8 = pDst - - mov [rsp + SymCryptFdefMontgomeryReduceMulx_Frame.pmModHome], rcx - mov [rsp + SymCryptFdefMontgomeryReduceMulx_Frame.pSrcHome], rdx - mov [rsp + SymCryptFdefMontgomeryReduceMulx_Frame.pDstHome], r8 - - mov r8, rdx - - mov eax, [rcx + SymCryptModulusNdigitsOffsetAmd64] - mov [rsp + SymCryptFdefMontgomeryReduceMulx_Frame.CntOuter], eax - ; CntOuter = nDigits - - xor ebx, ebx - mov [rsp + SymCryptFdefMontgomeryReduceMulx_Frame.HighCarry], ebx - ; HighCarry = 0 - -SymCryptFdefMontgomeryReduceMulx1024OuterLoop: - ; rcx = pmMod - ; r8 = pSrc = tmp buffer that we will reduce - mov rsi, [r8 + 0 * 8] - mov rbp, [r8 + 1 * 8] - mov r10, [r8 + 2 * 8] - mov r11, [r8 + 3 * 8] - mov r12, [r8 + 4 * 8] - mov r13, [r8 + 5 * 8] - mov r14, [r8 + 6 * 8] - mov r15, [r8 + 7 * 8] - - mov rdi, [rcx + SymCryptModulusMontgomeryInv64OffsetAmd64] ; inv64 - mov r9d, [rcx + SymCryptModulusNdigitsOffsetAmd64] - lea rcx, [rcx + SymCryptModulusValueOffsetAmd64] ; modulus value - - ; r8 = value to reduce - ; rsi - r15= r8[0..7] - ; rcx = modulus value - ; rdi = modinv - - MONTGOMERY18 rsi, rbp, r10, r11, r12, r13, r14, r15, rdi, rcx, r8 + 0 * 8, rax, rbx - MONTGOMERY18 rbp, r10, r11, r12, r13, r14, r15, rsi, rdi, rcx, r8 + 1 * 8, rax, rbx - MONTGOMERY18 r10, r11, r12, r13, r14, r15, rsi, rbp, rdi, rcx, r8 + 2 * 8, rax, rbx - MONTGOMERY18 r11, r12, r13, r14, r15, rsi, rbp, r10, rdi, rcx, r8 + 3 * 8, rax, rbx - MONTGOMERY18 r12, r13, r14, r15, rsi, rbp, r10, r11, rdi, rcx, r8 + 4 * 8, rax, rbx - MONTGOMERY18 r13, r14, r15, rsi, rbp, r10, r11, r12, rdi, rcx, r8 + 5 * 8, rax, rbx - MONTGOMERY18 r14, r15, rsi, rbp, r10, r11, r12, r13, rdi, rcx, r8 + 6 * 8, rax, rbx - MONTGOMERY18 r15, rsi, rbp, r10, r11, r12, r13, r14, rdi, rcx, r8 + 7 * 8, rax, rbx - - ; rsi .. r15 = carry from multiply-add - ; r8[0..7] = Montgomery factors - - mov rdi, r8 ; factor to multiply by - add rcx, 64 - add r8, 64 - - ; rsi, rbp, r10, r11, r12, r13, r14, r15 8-word carry - ; rax, rbx temps for multiplication - ; rcx running pointer pMod inner loop - ; r8 running pointer pSrc inner loop - ; rdi Montgomery factors for this row - ; r9 loop ctr - ; rdx fixed input reg for multiplication - - MULADD88 rsi, rbp, r10, r11, r12, r13, r14, r15, r8, rcx, rdi, rax, rbx - ; pre & post: Cy = Ov = 0 - ; R[7-0]:D[7-0] = A[7:0] * B[7:0] + R[7:0] + D[7:0] - ; rdx is volatile - - add rcx, 64 - add r8, 64 - - ; We have an 8-word carry here, which we need to add to the in-memory buffer and retain a carry - ; We also saved a 1-bit carry from the previous outer loop - xor edx, edx - mov eax, [rsp + SymCryptFdefMontgomeryReduceMulx_Frame.HighCarry] - ; move carry into Cy flag - neg eax - - ; We do this in separate instructions to help the instruction decoder build up a lead... 
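In C terms, the add-back step this comment introduces is roughly the sketch below (an illustrative model with hypothetical names, not SymCrypt source; it assumes a compiler with `unsigned __int128` support, e.g. gcc or clang). The same chain survives, register-renamed, in the symcryptasm rewrite later in this patch.

```c
#include <stdint.h>

typedef unsigned __int128 uint128_t;

// Model of the adc chain: fold the 8-word carry held in registers into the
// next 8 words of the buffer, starting from the saved 1-bit HighCarry, and
// return the new 1-bit carry for the next outer iteration.
static uint32_t add_back_carry( uint64_t buf[8], const uint64_t r[8], uint32_t highCarry )
{
    uint128_t c = highCarry;
    for( int i = 0; i < 8; i++ )
    {
        c += (uint128_t) buf[i] + r[i];
        buf[i] = (uint64_t) c;
        c >>= 64;
    }
    return (uint32_t) c;    // 0 or 1
}
```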
- mov rax, [r8 + 0 * 8] - adc rax, rsi - mov [r8 + 0 * 8], rax - - mov rbx, [r8 + 1 * 8] - adc rbx, rbp - mov [r8 + 1 * 8], rbx - - mov rax, [r8 + 2 * 8] - adc rax, r10 - mov [r8 + 2 * 8], rax - - mov rbx, [r8 + 3 * 8] - adc rbx, r11 - mov [r8 + 3 * 8], rbx - - mov rax, [r8 + 4 * 8] - adc rax, r12 - mov [r8 + 4 * 8], rax - - mov rbx, [r8 + 5 * 8] - adc rbx, r13 - mov [r8 + 5 * 8], rbx - - mov rax, [r8 + 6 * 8] - adc rax, r14 - mov [r8 + 6 * 8], rax - - mov rbx, [r8 + 7 * 8] - adc rbx, r15 - mov [r8 + 7 * 8], rbx - - adc edx, edx ; edx = carry - mov [rsp + SymCryptFdefMontgomeryReduceMulx_Frame.HighCarry], edx - - mov r8, [rsp + SymCryptFdefMontgomeryReduceMulx_Frame.pSrcHome] - add r8, 64 - mov [rsp + SymCryptFdefMontgomeryReduceMulx_Frame.pSrcHome], r8 - - mov rcx, [rsp + SymCryptFdefMontgomeryReduceMulx_Frame.pmModHome] - - mov eax, [rsp + SymCryptFdefMontgomeryReduceMulx_Frame.CntOuter] - sub eax, 1 - mov [rsp + SymCryptFdefMontgomeryReduceMulx_Frame.CntOuter], eax - - jnz SymCryptFdefMontgomeryReduceMulx1024Outerloop - - ; edx = output carry - - mov esi, [rcx + SymCryptModulusNdigitsOffsetAmd64] - lea rcx, [rcx + SymCryptModulusValueOffsetAmd64] ; modulus value - - mov rdi, [rsp + SymCryptFdefMontgomeryReduceMulx_Frame.pDstHome] - - ; r8 = result buffer pointer - ; esi = # digits - ; rcx = modulus value - ; rdi = Dst - - ; copy these values for the maked copy loop - mov r9d, esi ; nDigits - mov r10, r8 ; result buffer - mov rbp, rdi ; destination pointer - - ; pDst = Reduction result - Modulus - - mov rax,[r8 + 0 * 8] - sbb rax,[rcx + 0 * 8] - mov [rdi + 0 * 8], rax - - mov rbx,[r8 + 1 * 8] - sbb rbx,[rcx + 1 * 8] - mov [rdi + 1 * 8], rbx - - mov rax,[r8 + 2 * 8] - sbb rax,[rcx + 2 * 8] - mov [rdi + 2 * 8], rax - - mov rbx,[r8 + 3 * 8] - sbb rbx,[rcx + 3 * 8] - mov [rdi + 3 * 8], rbx - - mov rax,[r8 + 4 * 8] - sbb rax,[rcx + 4 * 8] - mov [rdi + 4 * 8], rax - - mov rbx,[r8 + 5 * 8] - sbb rbx,[rcx + 5 * 8] - mov [rdi + 5 * 8], rbx - - mov rax,[r8 + 6 * 8] - sbb rax,[rcx + 6 * 8] - mov [rdi + 6 * 8], rax - - mov rbx,[r8 + 7 * 8] - sbb rbx,[rcx + 7 * 8] - mov [rdi + 7 * 8], rbx - - mov rax,[r8 + 8 * 8] - sbb rax,[rcx + 8 * 8] - mov [rdi + 8 * 8], rax - - mov rbx,[r8 + 9 * 8] - sbb rbx,[rcx + 9 * 8] - mov [rdi + 9 * 8], rbx - - mov rax,[r8 + 10 * 8] - sbb rax,[rcx + 10 * 8] - mov [rdi + 10 * 8], rax - - mov rbx,[r8 + 11 * 8] - sbb rbx,[rcx + 11 * 8] - mov [rdi + 11 * 8], rbx - - mov rax,[r8 + 12 * 8] - sbb rax,[rcx + 12 * 8] - mov [rdi + 12 * 8], rax - - mov rbx,[r8 + 13 * 8] - sbb rbx,[rcx + 13 * 8] - mov [rdi + 13 * 8], rbx - - mov rax,[r8 + 14 * 8] - sbb rax,[rcx + 14 * 8] - mov [rdi + 14 * 8], rax - - mov rbx,[r8 + 15 * 8] - sbb rbx,[rcx + 15 * 8] - mov [rdi + 15 * 8], rbx - - - ; now a masked copy from the reduction buffer to the destination. 
- ; copy if high carry = 0 and Cy = 1 - sbb edx, 0 - ; edx = copy mask, ff...ff if copy, 0 of no copy - - movd xmm0, edx ; xmm0[0] = mask - pcmpeqd xmm1, xmm1 ; xmm1 = ff...ff - pshufd xmm0, xmm0, 0 ; xmm0[0..3] = mask - pxor xmm1, xmm0 ; xmm1 = not Mask - - - movdqa xmm2, [r10 + 0 * 16] ; xmm2 = pSrc[0] - movdqa xmm3, [rbp + 0 * 16] ; xmm3 = pDst[0] - pand xmm2, xmm0 - pand xmm3, xmm1 - por xmm2, xmm3 - movdqa [rbp + 0 * 16], xmm2 - - movdqa xmm2, [r10 + 1 * 16] ; xmm2 = pSrc[0] - movdqa xmm3, [rbp + 1 * 16] ; xmm3 = pDst[0] - pand xmm2, xmm0 - pand xmm3, xmm1 - por xmm2, xmm3 - movdqa [rbp + 1 * 16], xmm2 - - movdqa xmm2, [r10 + 2 * 16] ; xmm2 = pSrc[0] - movdqa xmm3, [rbp + 2 * 16] ; xmm3 = pDst[0] - pand xmm2, xmm0 - pand xmm3, xmm1 - por xmm2, xmm3 - movdqa [rbp + 2 * 16], xmm2 - - movdqa xmm2, [r10 + 3 * 16] ; xmm2 = pSrc[0] - movdqa xmm3, [rbp + 3 * 16] ; xmm3 = pDst[0] - pand xmm2, xmm0 - pand xmm3, xmm1 - por xmm2, xmm3 - movdqa [rbp + 3 * 16], xmm2 - - movdqa xmm2, [r10 + 4 * 16] ; xmm2 = pSrc[0] - movdqa xmm3, [rbp + 4 * 16] ; xmm3 = pDst[0] - pand xmm2, xmm0 - pand xmm3, xmm1 - por xmm2, xmm3 - movdqa [rbp + 4 * 16], xmm2 - - movdqa xmm2, [r10 + 5 * 16] ; xmm2 = pSrc[0] - movdqa xmm3, [rbp + 5 * 16] ; xmm3 = pDst[0] - pand xmm2, xmm0 - pand xmm3, xmm1 - por xmm2, xmm3 - movdqa [rbp + 5 * 16], xmm2 - - movdqa xmm2, [r10 + 6 * 16] ; xmm2 = pSrc[0] - movdqa xmm3, [rbp + 6 * 16] ; xmm3 = pDst[0] - pand xmm2, xmm0 - pand xmm3, xmm1 - por xmm2, xmm3 - movdqa [rbp + 6 * 16], xmm2 - - movdqa xmm2, [r10 + 7 * 16] ; xmm2 = pSrc[0] - movdqa xmm3, [rbp + 7 * 16] ; xmm3 = pDst[0] - pand xmm2, xmm0 - pand xmm3, xmm1 - por xmm2, xmm3 - movdqa [rbp + 7 * 16], xmm2 - - - BEGIN_EPILOGUE - - pop rbp - pop rbx - pop rsi - pop rdi - pop r15 - pop r14 - pop r13 - pop r12 - ret - - NESTED_END SymCryptFdefMontgomeryReduceMulx1024, _TEXT - - -;============================================================================= -; test code - -MULX_TEST_1 MACRO - mulx rax, rcx, [r8 + 8] - adcx r10, rcx - adox r11, rax - ENDM - -MULX_TEST_4 MACRO - MULX_TEST_1 - MULX_TEST_1 - MULX_TEST_1 - MULX_TEST_1 - ENDM - -MULX_TEST_16 MACRO - MULX_TEST_4 - MULX_TEST_4 - MULX_TEST_4 - MULX_TEST_4 - ENDM - -MULX_TEST_64 MACRO - MULX_TEST_16 - MULX_TEST_16 - MULX_TEST_16 - MULX_TEST_16 - ENDM - -MULX_TEST_256 MACRO - MULX_TEST_64 - MULX_TEST_64 - MULX_TEST_64 - MULX_TEST_64 - ENDM - -MULX_TEST_1024 MACRO - MULX_TEST_256 - MULX_TEST_256 - MULX_TEST_256 - MULX_TEST_256 - ENDM - - LEAF_ENTRY SymCryptTestMulx, _TEXT - - mov r8, rsp - - MULX_TEST_1024 - - ret - LEAF_END SymCryptTestMulx, _TEXT - - - - end diff --git a/lib/amd64/fdef_mulx.symcryptasm b/lib/amd64/fdef_mulx.symcryptasm new file mode 100644 index 0000000..c072fc5 --- /dev/null +++ b/lib/amd64/fdef_mulx.symcryptasm @@ -0,0 +1,1265 @@ +// +// fdef_mulx.symcryptasm Assembler code for large integer arithmetic in the default data format +// using the bmi2 instructions mulx, adcx and adox +// Expresses asm in a generic enough way to enable generation of MASM and GAS using the +// symcryptasm_processor.py script and C preprocessor +// +// Copyright (c) Microsoft Corporation. Licensed under the MIT license. 
+// + +#include "symcryptasm_shared.cppasm" + +MACRO_START(ZEROREG, R) + xor R,R +MACRO_END() + +MACRO_START(ZEROREG_8, R0, R1, R2, R3, R4, R5, R6, R7) + ZEROREG R0 + ZEROREG R1 + ZEROREG R2 + ZEROREG R3 + ZEROREG R4 + ZEROREG R5 + ZEROREG R6 + ZEROREG R7 +MACRO_END() + +MACRO_START(MULADD18, R0, R1, R2, R3, R4, R5, R6, R7, pD, pA, pB, T0, T1, QH) + // R0:R[7:1]:D[0] = A[7:0] * B[0] + D[0] + R[7:0] + // Pre: Cy = Ov = 0 + // Post: Cy = Ov = 0 + + mov QH, [pB] + adox R0, [pD] + + mulx T1, T0, [pA + 0 * 8] + adcx R0, T0 + adox R1, T1 + + mulx T1, T0, [pA + 1 * 8] + adcx R1, T0 + adox R2, T1 + + mulx T1, T0, [pA + 2 * 8] + adcx R2, T0 + adox R3, T1 + + mulx T1, T0, [pA + 3 * 8] + adcx R3, T0 + adox R4, T1 + + mulx T1, T0, [pA + 4 * 8] + adcx R4, T0 + adox R5, T1 + + mulx T1, T0, [pA + 5 * 8] + adcx R5, T0 + adox R6, T1 + + mulx T1, T0, [pA + 6 * 8] + adcx R6, T0 + adox R7, T1 + + mulx T1, T0, [pA + 7 * 8] + adcx R7, T0 + mov [pD], R0 + + mov R0, 0 + adcx R0, R0 + adox R0, T1 +MACRO_END() + +MACRO_START(MULADD88, R0, R1, R2, R3, R4, R5, R6, R7, pD, pA, pB, T0, T1, QH) + // pre & post: Cy = Ov = 0 + // R[7-0]:D[7-0] = A[7:0] * B[7:0] + R[7:0] + D[7:0] + + MULADD18 R0, R1, R2, R3, R4, R5, R6, R7, pD , pA, pB , T0, T1, QH + MULADD18 R1, R2, R3, R4, R5, R6, R7, R0, pD + 8, pA, pB + 8, T0, T1, QH + MULADD18 R2, R3, R4, R5, R6, R7, R0, R1, pD + 16, pA, pB + 16, T0, T1, QH + MULADD18 R3, R4, R5, R6, R7, R0, R1, R2, pD + 24, pA, pB + 24, T0, T1, QH + MULADD18 R4, R5, R6, R7, R0, R1, R2, R3, pD + 32, pA, pB + 32, T0, T1, QH + MULADD18 R5, R6, R7, R0, R1, R2, R3, R4, pD + 40, pA, pB + 40, T0, T1, QH + MULADD18 R6, R7, R0, R1, R2, R3, R4, R5, pD + 48, pA, pB + 48, T0, T1, QH + MULADD18 R7, R0, R1, R2, R3, R4, R5, R6, pD + 56, pA, pB + 56, T0, T1, QH +MACRO_END() + + +MACRO_START(HALF_SQUARE_NODIAG8, R0, R1, R2, R3, R4, R5, R6, R7, pD, pA, T0, T1, QH) + // pre & post: Cy = Ov = 0 + // R[7-0]:D[7-0] = D[7:0] + (A[0:7]^2 - \sum_{i=0}^7 (A[i] * 2^{64*i}) )/2 + // This is the component of the square that needs to be doubled, and then the diagonals added + + // Note that Dst[0] is not changed by this macro + + mov QH, [pA + 0 * 8] // QH = A0 + mov R1, [pD + 1 * 8] + mov R2, [pD + 2 * 8] + mov R3, [pD + 3 * 8] + mov R4, [pD + 4 * 8] + mov R5, [pD + 5 * 8] + mov R6, [pD + 6 * 8] + mov R7, [pD + 7 * 8] + xor R0, R0 + + mulx T1, T0, [pA + 1 * 8] + adcx R1, T0 + adox R2, T1 + + mulx T1, T0, [pA + 2 * 8] + adcx R2, T0 + adox R3, T1 + + mulx T1, T0, [pA + 3 * 8] + adcx R3, T0 + adox R4, T1 + + mulx T1, T0, [pA + 4 * 8] + adcx R4, T0 + adox R5, T1 + + mulx T1, T0, [pA + 5 * 8] + adcx R5, T0 + adox R6, T1 + + mulx T1, T0, [pA + 6 * 8] + adcx R6, T0 + adox R7, T1 + + mulx T1, T0, [pA + 7 * 8] + adcx R7, T0 + mov [pD + 1 * 8], R1 + + adcx R0, R0 + adox R0, T1 + mov [pD + 2 * 8], R2 + mov QH, [pA + 1 * 8] // QH = A1 + + //======= + + mulx T1, T0, [pA + 2 * 8] + adcx R3, T0 + adox R4, T1 + + mulx T1, T0, [pA + 3 * 8] + adcx R4, T0 + adox R5, T1 + + mulx T1, T0, [pA + 4 * 8] + adcx R5, T0 + adox R6, T1 + + mulx T1, T0, [pA + 5 * 8] + adcx R6, T0 + adox R7, T1 + + mulx T1, T0, [pA + 6 * 8] + adcx R7, T0 + adox R0, T1 + + mov QH, [pA + 7 * 8] // QH = A7 + mov R1, 0 + mov R2, 0 + mov [pD + 3 * 8], R3 + + mulx T1, T0, [pA + 1 * 8] + adcx R0, T0 + adox R1, T1 // doesn't produce Ov as T1 <= 0xff..fe and R1=0 + + mulx T1, T0, [pA + 2 * 8] + adcx R1, T0 + mov [pD + 4 * 8], R4 + + adcx R2, T1 + mov QH, [pA + 2 * 8] // QH = A2 + + //====== + + mulx T1, T0, [pA + 3 * 8] + adcx R5, T0 + adox R6, T1 + + mulx T1, T0, [pA + 4 * 8] + adcx 
R6, T0 + adox R7, T1 + + mulx T1, T0, [pA + 5 * 8] + adcx R7, T0 + adox R0, T1 + + mulx T1, T0, [pA + 6 * 8] + adcx R0, T0 + adox R1, T1 + + mov QH, [pA + 4 * 8] // QH = A4 + mov R3, 0 + mov R4, 0 + + mulx T1, T0, [pA + 5 * 8] + adcx R1, T0 + adox R2, T1 + + mulx T1,T0, [pA + 6 * 8] + adcx R2, T0 + adox R3, T1 // doesn't produce Ov as T1 <= 0xff..fe and R3=0 + + mov QH, [pA + 5 * 8] // QH = A5 + mov [pD + 5 * 8], R5 + + mulx T1, T0, [pA + 6 * 8] + adcx R3, T0 + adcx R4, T1 + + mov QH, [pA + 3 * 8] // QH = A3 + mov [pD + 6 * 8], R6 + + //====== + + mulx T1, T0, [pA + 4 * 8] + adcx R7, T0 + adox R0, T1 + + mulx T1, T0, [pA + 5 * 8] + adcx R0, T0 + adox R1, T1 + + mulx T1, T0, [pA + 6 * 8] + adcx R1, T0 + adox R2, T1 + + mulx T1, T0, [pA + 7 * 8] + adcx R2, T0 + adox R3, T1 + + mov QH, [pA + 7 * 8] // QH = A7 + mov R5, 0 + mov R6, 0 + mov [pD + 7 * 8], R7 + + mulx T1, T0, [pA + 4 * 8] + adcx R3, T0 + adox R4, T1 + + mulx T1, T0, [pA + 5 * 8] + adcx R4, T0 + adox R5, T1 // doesn't produce Ov as T1 <= 0xff..fe and R5=0 + + mulx T1, T0, [pA + 6 * 8] + adcx R5, T0 + adcx R6, T1 + + xor R7, R7 +MACRO_END() + +MACRO_START(MONTGOMERY18, R0, R1, R2, R3, R4, R5, R6, R7, modInv, pMod, pMont, T0, T1, QH) + // Mont[0] = (modinv * R0 mod 2^64) + // R0:R[7:1]: = Mont[0] * Mod[7:0] + R[7:0] + // Pre: - + // Post: - + + mov QH, R0 + imul QH, modInv + + // Rather than add the low half of the first mulx to R0 we can go ahead and set + // up the Cy flag appropriately based on R0 directly (the addition will always + // result in 0 by construction), so we can have the result while imul is running + + // This has a small but measurable perf improvement on SKLX (~2% improvement for + // 512b modmul) + // and it seems unlikely that it can make the performance worse + // My best guess as to why is that allowing this to execute a few cycles early + // can reduce port contention when the macro is being speculatively executed + or T0, -1 // Clear Cy and Ov + adcx R0, T0 // Set Cy when R0 is non-zero + mov R0, 0 + mov [pMont], QH + + mulx T1, T1, [pMod + 0 * 8] + adox R1, T1 + + mulx T1, T0, [pMod + 1 * 8] + adcx R1, T0 + adox R2, T1 + + mulx T1, T0, [pMod + 2 * 8] + adcx R2, T0 + adox R3, T1 + + mulx T1, T0, [pMod + 3 * 8] + adcx R3, T0 + adox R4, T1 + + mulx T1, T0, [pMod + 4 * 8] + adcx R4, T0 + adox R5, T1 + + mulx T1, T0, [pMod + 5 * 8] + adcx R5, T0 + adox R6, T1 + + mulx T1, T0, [pMod + 6 * 8] + adcx R6, T0 + adox R7, T1 + + mulx T1, T0, [pMod + 7 * 8] + adcx R7, T0 + + adcx R0, R0 + adox R0, T1 +MACRO_END() + +MACRO_START(SYMCRYPT_SQUARE_DIAG, index, src_reg, dest_reg, T0, T1, T2, T3, QH) + mov QH, [src_reg + 8 * index] + mov T0, [dest_reg + 16 * index] + mov T1, [dest_reg + 16 * index + 8] + mulx T3, T2, QH + adcx T2, T0 + adox T2, T0 + adcx T3, T1 + adox T3, T1 + mov [dest_reg + 16 * index], T2 + mov [dest_reg + 16 * index + 8], T3 +MACRO_END() + +// VOID +// SYMCRYPT_CALL +// SymCryptFdefRawMulMulx( +// _In_reads_(nDigits1*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc1, +// UINT32 nDigits1, +// _In_reads_(nDigits2*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc2, +// UINT32 nDigits2, +// _Out_writes_((nDigits1+nDigits2)*SYMCRYPT_FDEF_DIGIT_NUINT32) PUINT32 pDst ) + +MUL_FUNCTION_START(SymCryptFdefRawMulMulx, 5, 14) + + shl Q4, 6 + mov [rsp + GET_MEMSLOT_OFFSET(slot0)], Q4 + mov [rsp + GET_MEMSLOT_OFFSET(slot1)], D2 + + // First we wipe nDigits2 of the result (size of in) + mov Q6, Q5 + + // Wipe destination for nDigit2 blocks + xorps xmm0,xmm0 // Zero register for 16-byte wipes + mov Q0, Q4 + 
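Before the wipe loop below, it is worth pinning down what the multiply-add macros defined above compute. Here is a minimal C model of MULADD18 (illustrative only, not the SymCrypt C implementation; the helper name and the `unsigned __int128` type are assumptions). The asm gets its speed from running two independent carry chains, adcx on the carry flag and adox on the overflow flag, so the eight 64x64-to-128 partial products never serialize on a single flags register.

```c
#include <stdint.h>

typedef unsigned __int128 uint128_t;

// R0:R[7:1]:D[0] = A[7:0] * b + D[0] + R[7:0], as in the macro contract.
// The low word of the sum goes back to *pD; words 1..8 end up in R[0..7],
// modeling the rotated register naming the asm uses instead of moves.
static void muladd18( uint64_t R[8], uint64_t *pD, const uint64_t A[8], uint64_t b )
{
    uint64_t s[9];
    uint128_t c = 0;

    for( int i = 0; i < 8; i++ )
    {
        c += (uint128_t) A[i] * b + R[i];
        if( i == 0 )
        {
            c += *pD;           // D[0] joins the accumulation
        }
        s[i] = (uint64_t) c;
        c >>= 64;
    }
    s[8] = (uint64_t) c;

    *pD = s[0];
    for( int i = 0; i < 8; i++ )
    {
        R[i] = s[i + 1];        // register rotation R1,...,R7,R0
    }
}
```

MULADD88 is simply eight of these steps with the register roles rotated each time, and the function bodies below iterate it one 64-byte digit at a time.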
+SymCryptFdefRawMulMulxWipeLoop: + movaps [Q6],xmm0 + movaps [Q6+16],xmm0 // Wipe 32 bytes + movaps [Q6+32],xmm0 // Wipe 32 bytes + movaps [Q6+48],xmm0 // Wipe 32 bytes + add Q6, 64 + sub Q0, 64 + jnz SymCryptFdefRawMulMulxWipeLoop + + +SymCryptFdefRawMulxOuterLoop: + + ZEROREG_8 Q6, Q7, Q8, Q9, Q10, Q11, Q12, Q13 // Leaves Cy = Ov = 0 + +SymCryptFdefRawMulMulxInnerLoop: + + // Register allocation in loops: + // Q6, Q7, Q8, Q9, Q10, Q11, Q12, Q13 8-word carry + // Q0, Q2 temps for multiplication + // Q1, Q3 pSrc1, pSrc2 running pointers + // Q4 inner loop counter + // QH fixed input reg for multiplication + // Q5 Destination running pointer inner loop + // slot0 nDigits2*64 + // slot1 outer loop counter + + MULADD88 Q6, Q7, Q8, Q9, Q10, Q11, Q12, Q13, Q5, Q1, Q3, Q0, Q2, QH + + add Q3, 64 // Src2 ptr + add Q5, 64 + + sub D4, 64 // sets Cy = Ov = 0 because 64*nDigits2 < 2^32 + jnz SymCryptFdefRawMulMulxInnerLoop + + // Write the 8-word carry-out to the destination + mov [Q5 + 0*8], Q6 + mov [Q5 + 1*8], Q7 + mov [Q5 + 2*8], Q8 + mov [Q5 + 3*8], Q9 + mov [Q5 + 4*8], Q10 + mov [Q5 + 5*8], Q11 + mov [Q5 + 6*8], Q12 + mov [Q5 + 7*8], Q13 + + // set up for next iteration + // reload 64*nDigits2 + mov Q4, [rsp + GET_MEMSLOT_OFFSET(slot0)] + + // reset Q5 & increment + sub Q5, Q4 + add Q5, 64 + // reset Q3 + sub Q3, Q4 + + // update PSrc1 + add Q1, 64 + + // nDigits1 loop counter + mov D2, [rsp + GET_MEMSLOT_OFFSET(slot1)] + sub D2, 1 // sets Cy = Ov = 0 because nDigits1 < 2^32 / 64 + mov [rsp + GET_MEMSLOT_OFFSET(slot1)], D2 + jnz SymCryptFdefRawMulxOuterLoop + +MUL_FUNCTION_END(SymCryptFdefRawMulMulx) + +// VOID +// SYMCRYPT_CALL +// SymCryptFdefRawSquareMulx( +// _In_reads_(nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc, +// UINT32 nDigits, +// _Out_writes_(2*nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PUINT32 pDst ) + +MUL_FUNCTION_START(SymCryptFdefRawSquareMulx, 3, 14) + + // Q1 = pSrc + // Q2 = nDigits + // Q3 = pDst + + // Save parameters for phase 2 + + mov [rsp + GET_MEMSLOT_OFFSET(slot0)], Q1 // save pSrc + mov [rsp + GET_MEMSLOT_OFFSET(slot1)], Q2 // save nDigits + mov [rsp + GET_MEMSLOT_OFFSET(slot2)], Q3 // save pDst + + shl Q2, 6 // nDigits * 64 = # bytes in Src to process + mov [rsp + GET_MEMSLOT_OFFSET(slot3)], Q2 // save # bytes in Src to process + + // Wipe destination for nDigits blocks + xor Q0, Q0 + mov Q5, Q3 + mov Q4, Q2 + +SymCryptFdefRawSquareMulxWipeLoop: + // we use 8-byte writes as we will be reading this very soon in 8-byte chunks, and this way the store-load + // forwarding works + mov [Q5 ], Q0 + mov [Q5 + 8], Q0 + mov [Q5 + 16], Q0 + mov [Q5 + 24], Q0 + mov [Q5 + 32], Q0 + mov [Q5 + 40], Q0 + mov [Q5 + 48], Q0 + mov [Q5 + 56], Q0 + add Q5, 64 + sub Q4, 64 + jnz SymCryptFdefRawSquareMulxWipeLoop + + // Cy = Ov = 0 here because the last 'sub Q4,64' yielded 0 + +SymCryptFdefRawSquareMulxOuterLoop: + + HALF_SQUARE_NODIAG8 Q6, Q7, Q8, Q9, Q10, Q11, Q12, Q13, Q3, Q1, Q0, Q4, QH + + sub Q2, 64 + jz SymCryptFdefRawSquareMulxPhase2 // end of phase 1 + + lea Q5, [Q1 + 64] + lea Q3, [Q3 + 64] + +SymCryptFdefRawSquareMulxInnerLoop: + // Q6, Q7, Q8, Q9, Q10, Q11, Q12, Q13 8-word carry + // Q0, Q4 temps for multiplication + // Q1 pSrc running pointer outer loop + // Q2 # bytes left in pSrc to process in the inner loop + // Q3 pDst running pointer inner loop + // QH fixed input reg for multiplication + // Q5 pSrc running pointer inner loop + + // slot0 pSrc (used for final pass) + // slot1 nDigits (used for final pass) + // slot2 pDst (used for final pass) + // slot3 # bytes to 
process from pSrc in this iteration + + MULADD88 Q6, Q7, Q8, Q9, Q10, Q11, Q12, Q13, Q3, Q1, Q5, Q0, Q4, QH + + add Q3, 64 + add Q5, 64 + + sub Q2, 64 // Sets Cy = Ov = 0 because nDigits < 2^32 / bits_per_digit + jnz SymCryptFdefRawSquareMulxInnerLoop + + // Write the 8-word carry-out to the destination + mov [Q3 + 0*8], Q6 + mov [Q3 + 1*8], Q7 + mov [Q3 + 2*8], Q8 + mov [Q3 + 3*8], Q9 + mov [Q3 + 4*8], Q10 + mov [Q3 + 5*8], Q11 + mov [Q3 + 6*8], Q12 + mov [Q3 + 7*8], Q13 + + mov Q2, [rsp + GET_MEMSLOT_OFFSET(slot3)] // restore # bytes in Src to process next + + add Q1, 64 // Shift outer Src pointer by 1 digit + sub Q3, Q2 // reset output ptr + add Q3, 128 // Shift output ptr by 2 digits + + sub Q2, 64 // Reduce number of bytes to process by 1 digit + mov [rsp + GET_MEMSLOT_OFFSET(slot3)], Q2 + + jmp SymCryptFdefRawSquareMulxOuterLoop + + +SymCryptFdefRawSquareMulxPhase2: + // Cy = Ov = 0 because last 'sub Q2, 64' resulted in 0 + + // Write the 8-word carry-out to the destination + mov [Q3 + 8*8], Q6 + mov [Q3 + 9*8], Q7 + mov [Q3 + 10*8], Q8 + mov [Q3 + 11*8], Q9 + mov [Q3 + 12*8], Q10 + mov [Q3 + 13*8], Q11 + mov [Q3 + 14*8], Q12 + mov [Q3 + 15*8], Q13 + + // Compute diagonals, and add double the result so far + + mov Q1, [rsp + GET_MEMSLOT_OFFSET(slot0)] + mov Q2, [rsp + GET_MEMSLOT_OFFSET(slot1)] + mov Q3, [rsp + GET_MEMSLOT_OFFSET(slot2)] + + // We can't keep the carries in Cy and Ov because there is no way to do a loop counter + // without touching the Ov flag. + // So we set the Ov carry in Q0, and retain a zero in Q4 + xor Q0, Q0 + xor Q4, Q4 + +SymCryptFdefRawSquareMulxDiagonalsLoop: + // Cy = carry in + // Q0 = carry in (1 bit) + // Ov = 0 + + // First word is different to handle the carry + // SYMCRYPT_SQUARE_DIAG 0, Q1, Q3, Q5, Q6, Q7, Q8, QH + mov QH, [Q1] + mov Q5, [Q3] + mov Q6, [Q3 + 8] + mulx Q8, Q7, QH + adcx Q7, Q0 // add both carries + adcx Q8, Q4 // Q4 = 0 - now Cy = 0 because result of multiply <= ff..fe00..01 + + adcx Q7, Q5 + adox Q7, Q5 + adcx Q8, Q6 + adox Q8, Q6 + mov [Q3], Q7 + mov [Q3 + 8], Q8 + + SYMCRYPT_SQUARE_DIAG 1, Q1, Q3, Q5, Q6, Q7, Q8, QH + SYMCRYPT_SQUARE_DIAG 2, Q1, Q3, Q5, Q6, Q7, Q8, QH + SYMCRYPT_SQUARE_DIAG 3, Q1, Q3, Q5, Q6, Q7, Q8, QH + SYMCRYPT_SQUARE_DIAG 4, Q1, Q3, Q5, Q6, Q7, Q8, QH + SYMCRYPT_SQUARE_DIAG 5, Q1, Q3, Q5, Q6, Q7, Q8, QH + SYMCRYPT_SQUARE_DIAG 6, Q1, Q3, Q5, Q6, Q7, Q8, QH + SYMCRYPT_SQUARE_DIAG 7, Q1, Q3, Q5, Q6, Q7, Q8, QH + + // Move the Ov flag into Q0 + mov D0, D4 + adox D0, D4 + + // There is no way to do a loop counter without overwriting the Ov flag + // Even the 'dec' instruction touches it, and LAHF/SAHF doesn't load/store the Ov flag. 
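For reference, the arithmetic of this diagonal pass can be modeled in C with one explicit carry word (an illustrative sketch with assumed names, not SymCrypt source). Adding each destination word through both the adcx and adox chains is what achieves the doubling in the asm; the model simply multiplies by two.

```c
#include <stdint.h>

typedef unsigned __int128 uint128_t;

// On entry dst holds the cross products H = sum_{i<j} a[i]*a[j]*2^(64(i+j));
// this pass computes dst = 2*H + sum_i a[i]^2 * 2^(128*i).
static void square_diagonals( uint64_t *dst, const uint64_t *a, int nWords )
{
    uint128_t c = 0;    // the asm splits this carry over the Cy and Ov flags

    for( int i = 0; i < nWords; i++ )
    {
        uint128_t sq = (uint128_t) a[i] * a[i];

        c += ((uint128_t) dst[2*i] << 1) + (uint64_t) sq;
        dst[2*i] = (uint64_t) c;
        c >>= 64;

        c += ((uint128_t) dst[2*i + 1] << 1) + (uint64_t) (sq >> 64);
        dst[2*i + 1] = (uint64_t) c;
        c >>= 64;
    }
}
```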
+ // We can't push/pop efl in a function body + + lea Q1, [Q1 + 64] + lea Q3, [Q3 + 128] + dec Q2 + jnz SymCryptFdefRawSquareMulxDiagonalsLoop + +MUL_FUNCTION_END(SymCryptFdefRawSquareMulx) + +// VOID +// SYMCRYPT_CALL +// SymCryptFdefMontgomeryReduceMulx( +// _In_ PCSYMCRYPT_MODULUS pmMod, +// _In_ PUINT32 pSrc, +// _Out_ PUINT32 pDst ) +MUL_FUNCTION_START(SymCryptFdefMontgomeryReduceMulx, 3, 14) + + mov [rsp + GET_MEMSLOT_OFFSET(slot0)], Q1 + mov [rsp + GET_MEMSLOT_OFFSET(slot1)], Q2 + mov [rsp + GET_MEMSLOT_OFFSET(slot2)], Q3 + + mov D0, [Q1 + SymCryptModulusNdigitsOffsetAmd64] + mov [rsp + GET_MEMSLOT_OFFSET(slot3)], D0 + // CntOuter = nDigits - using first half of slot3 + + xor D4, D4 + mov [rsp + GET_MEMSLOT_OFFSET(slot3) + 4], D4 + // HighCarry = 0 - using second half of slot3 + +SymCryptFdefMontgomeryReduceMulxOuterLoop: + // Q1 = pmMod + // Q2 = pSrc = tmp buffer that we will reduce + mov Q6, [Q2 + 0 * 8] + mov Q7, [Q2 + 1 * 8] + mov Q8, [Q2 + 2 * 8] + mov Q9, [Q2 + 3 * 8] + mov Q10, [Q2 + 4 * 8] + mov Q11, [Q2 + 5 * 8] + mov Q12, [Q2 + 6 * 8] + mov Q13, [Q2 + 7 * 8] + + mov Q3, [Q1 + SymCryptModulusMontgomeryInv64OffsetAmd64] // inv64 + mov D4, [Q1 + SymCryptModulusNdigitsOffsetAmd64] + lea Q1, [Q1 + SymCryptModulusValueOffsetAmd64] // modulus value + + // Q2 = value to reduce + // Q6 - Q13 = Q2[0..7] + // Q1 = modulus value + // Q3 = modinv + + MONTGOMERY18 Q6, Q7, Q8, Q9, Q10, Q11, Q12, Q13, Q3, Q1, Q2 + (0 * 8), Q0, Q5, QH + MONTGOMERY18 Q7, Q8, Q9, Q10, Q11, Q12, Q13, Q6, Q3, Q1, Q2 + (1 * 8), Q0, Q5, QH + MONTGOMERY18 Q8, Q9, Q10, Q11, Q12, Q13, Q6, Q7, Q3, Q1, Q2 + (2 * 8), Q0, Q5, QH + MONTGOMERY18 Q9, Q10, Q11, Q12, Q13, Q6, Q7, Q8, Q3, Q1, Q2 + (3 * 8), Q0, Q5, QH + MONTGOMERY18 Q10, Q11, Q12, Q13, Q6, Q7, Q8, Q9, Q3, Q1, Q2 + (4 * 8), Q0, Q5, QH + MONTGOMERY18 Q11, Q12, Q13, Q6, Q7, Q8, Q9, Q10, Q3, Q1, Q2 + (5 * 8), Q0, Q5, QH + MONTGOMERY18 Q12, Q13, Q6, Q7, Q8, Q9, Q10, Q11, Q3, Q1, Q2 + (6 * 8), Q0, Q5, QH + MONTGOMERY18 Q13, Q6, Q7, Q8, Q9, Q10, Q11, Q12, Q3, Q1, Q2 + (7 * 8), Q0, Q5, QH + + // Q6 - Q13 = carry from multiply-add + // Q2[0..7] = Montgomery factors + + mov Q3, Q2 // factor to multiply by + add Q1, 64 + add Q2, 64 + + dec D4 + jz SymCryptFdefMontgomeryReduceMulxInnerLoopDone + +SymCryptFdefMontgomeryReduceMulxInnerLoop: + + // Q6, Q7, Q8, Q9, Q10, Q11, Q12, Q13 8-word carry + // Q0, Q5 temps for multiplication + // Q1 running pointer pMod inner loop + // Q2 running pointer pSrc inner loop + // Q3 Montgomery factors for this row + // D4 loop ctr + // QH fixed input reg for multiplication + + MULADD88 Q6, Q7, Q8, Q9, Q10, Q11, Q12, Q13, Q2, Q1, Q3, Q0, Q5, QH + // pre & post: Cy = Ov = 0 + // Q13..Q6:Q2[7-0] = R[7-0]:D[7-0] = A[7:0] * B[7:0] + R[7:0] + D[7:0] + // QH is volatile + + add Q1, 64 + add Q2, 64 + dec D4 + jnz SymCryptFdefMontgomeryReduceMulxInnerLoop + + +SymCryptFdefMontgomeryReduceMulxInnerLoopDone: + + // We have an 8-word carry here, which we need to add to the in-memory buffer and retain a carry + // We also saved a 1-bit carry from the previous outer loop + mov D5, [rsp + GET_MEMSLOT_OFFSET(slot3) + 4] + // move carry into Cy flag + neg D5 + + // We do this in separate instructions to help the instruction decoder build up a lead... 
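Each MONTGOMERY18 invocation above performs one word of the reduction. A C model of that step (illustrative, with hypothetical names; inv64 is the precomputed negated inverse of the modulus mod 2^64 held in the modulus object):

```c
#include <stdint.h>

typedef unsigned __int128 uint128_t;

// Choose m = inv64 * r[0] mod 2^64 so that r + m*mod has a zero low word,
// add m*mod, and drop that word (a division by 2^64). Returns the factor m,
// which the asm stores over pSrc for the row multiply.
static uint64_t montgomery_word( uint64_t r[8], const uint64_t mod[8], uint64_t inv64 )
{
    uint64_t m = r[0] * inv64;                              // Montgomery factor
    uint128_t c = ((uint128_t) m * mod[0] + r[0]) >> 64;    // low word is 0 by choice of m

    for( int i = 1; i < 8; i++ )
    {
        c += (uint128_t) m * mod[i] + r[i];
        r[i - 1] = (uint64_t) c;
        c >>= 64;
    }
    r[7] = (uint64_t) c;
    return m;
}
```

Eight of these steps zero out one full digit; MULADD88 then multiplies the saved factors into the remaining digits, and the adc chain that follows folds the resulting 8-word carry back into the buffer.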
+ mov Q0, [Q2 + 0 * 8] + adc Q0, Q6 + mov [Q2 + 0 * 8], Q0 + + mov Q5, [Q2 + 1 * 8] + adc Q5, Q7 + mov [Q2 + 1 * 8], Q5 + + mov Q0, [Q2 + 2 * 8] + adc Q0, Q8 + mov [Q2 + 2 * 8], Q0 + + mov Q5, [Q2 + 3 * 8] + adc Q5, Q9 + mov [Q2 + 3 * 8], Q5 + + mov Q0, [Q2 + 4 * 8] + adc Q0, Q10 + mov [Q2 + 4 * 8], Q0 + + mov Q5, [Q2 + 5 * 8] + adc Q5, Q11 + mov [Q2 + 5 * 8], Q5 + + mov Q0, [Q2 + 6 * 8] + adc Q0, Q12 + mov [Q2 + 6 * 8], Q0 + + mov Q5, [Q2 + 7 * 8] + adc Q5, Q13 + mov [Q2 + 7 * 8], Q5 + + adc D4, D4 // D4 = carry (D4 was previously zero) + mov [rsp + GET_MEMSLOT_OFFSET(slot3) + 4], D4 + + mov Q2, [rsp + GET_MEMSLOT_OFFSET(slot1)] + add Q2, 64 + mov [rsp + GET_MEMSLOT_OFFSET(slot1)], Q2 + + mov Q1, [rsp + GET_MEMSLOT_OFFSET(slot0)] + + mov D0, [rsp + GET_MEMSLOT_OFFSET(slot3)] + dec D0 + mov [rsp + GET_MEMSLOT_OFFSET(slot3)], D0 + + jnz SymCryptFdefMontgomeryReduceMulxOuterLoop + + // D4 = output carry + + mov D6, [Q1 + SymCryptModulusNdigitsOffsetAmd64] + lea Q1, [Q1 + SymCryptModulusValueOffsetAmd64] // modulus value + + mov Q3, [rsp + GET_MEMSLOT_OFFSET(slot2)] + + // Q2 = result buffer pointer + // D6 = # digits + // Q1 = modulus value + // Q3 = Dst + + // copy these values for the masked copy loop + mov D7, D6 // nDigits + mov Q8, Q2 // result buffer + mov Q9, Q3 // destination pointer + + // pDst = Reduction result - Modulus + +SymCryptFdefMontgomeryReduceMulxSubLoop: + mov Q0,[Q2 + 0 * 8] + sbb Q0,[Q1 + 0 * 8] + mov [Q3 + 0 * 8], Q0 + + mov Q5,[Q2 + 1 * 8] + sbb Q5,[Q1 + 1 * 8] + mov [Q3 + 1 * 8], Q5 + + mov Q0,[Q2 + 2 * 8] + sbb Q0,[Q1 + 2 * 8] + mov [Q3 + 2 * 8], Q0 + + mov Q5,[Q2 + 3 * 8] + sbb Q5,[Q1 + 3 * 8] + mov [Q3 + 3 * 8], Q5 + + mov Q0,[Q2 + 4 * 8] + sbb Q0,[Q1 + 4 * 8] + mov [Q3 + 4 * 8], Q0 + + mov Q5,[Q2 + 5 * 8] + sbb Q5,[Q1 + 5 * 8] + mov [Q3 + 5 * 8], Q5 + + mov Q0,[Q2 + 6 * 8] + sbb Q0,[Q1 + 6 * 8] + mov [Q3 + 6 * 8], Q0 + + mov Q5,[Q2 + 7 * 8] + sbb Q5,[Q1 + 7 * 8] + mov [Q3 + 7 * 8], Q5 + + lea Q2, [Q2 + 64] + lea Q1, [Q1 + 64] + lea Q3, [Q3 + 64] + dec D6 + jnz SymCryptFdefMontgomeryReduceMulxSubLoop + + // now a masked copy from the reduction buffer to the destination. 
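In C, the constant-time selection about to happen looks roughly like this (a minimal sketch with assumed names, not SymCrypt source): the conditional undo of the subtraction uses an all-ones/all-zeroes mask instead of a branch, so the timing and memory access pattern do not depend on the secret value.

```c
#include <stdint.h>

// borrow is the Cy flag out of the subtraction loop, highCarry the saved
// carry bit; the unsubtracted value is copied back exactly when the
// subtraction underflowed, i.e. highCarry == 0 and borrow == 1.
static void masked_copy( uint64_t *dst, const uint64_t *src,
                         uint32_t highCarry, uint32_t borrow, int nWords )
{
    uint64_t mask = (uint64_t) 0 - (uint64_t) (borrow & (1 ^ highCarry));

    for( int i = 0; i < nWords; i++ )
    {
        dst[i] = (src[i] & mask) | (dst[i] & ~mask);
    }
}
```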
+ // copy if high carry = 0 and Cy = 1
+ sbb D4, 0
+ // D4 = copy mask, ff...ff if copy, 0 if no copy
+
+ movd xmm0, D4 // xmm0[0] = mask
+ pcmpeqd xmm1, xmm1 // xmm1 = ff...ff
+ pshufd xmm0, xmm0, 0 // xmm0[0..3] = mask
+ pxor xmm1, xmm0 // xmm1 = not Mask
+
+SymCryptFdefMontgomeryReduceMulxMaskedCopyLoop:
+ movdqa xmm2, [Q8 + 0 * 16] // xmm2 = pSrc[0]
+ movdqa xmm3, [Q9 + 0 * 16] // xmm3 = pDst[0]
+ pand xmm2, xmm0
+ pand xmm3, xmm1
+ por xmm2, xmm3
+ movdqa [Q9 + 0 * 16], xmm2
+
+ movdqa xmm2, [Q8 + 1 * 16] // xmm2 = pSrc[1]
+ movdqa xmm3, [Q9 + 1 * 16] // xmm3 = pDst[1]
+ pand xmm2, xmm0
+ pand xmm3, xmm1
+ por xmm2, xmm3
+ movdqa [Q9 + 1 * 16], xmm2
+
+ movdqa xmm2, [Q8 + 2 * 16] // xmm2 = pSrc[2]
+ movdqa xmm3, [Q9 + 2 * 16] // xmm3 = pDst[2]
+ pand xmm2, xmm0
+ pand xmm3, xmm1
+ por xmm2, xmm3
+ movdqa [Q9 + 2 * 16], xmm2
+
+ movdqa xmm2, [Q8 + 3 * 16] // xmm2 = pSrc[3]
+ movdqa xmm3, [Q9 + 3 * 16] // xmm3 = pDst[3]
+ pand xmm2, xmm0
+ pand xmm3, xmm1
+ por xmm2, xmm3
+ movdqa [Q9 + 3 * 16], xmm2
+
+ // Move on to the next digit
+
+ add Q8, 64
+ add Q9, 64
+ dec D7
+ jnz SymCryptFdefMontgomeryReduceMulxMaskedCopyLoop
+
+MUL_FUNCTION_END(SymCryptFdefMontgomeryReduceMulx)
+
+// --------------------------------
+// 1024-bit size specific functions
+// --------------------------------
+
+//VOID
+//SYMCRYPT_CALL
+//SymCryptFdefRawMulMulx1024(
+// _In_reads_(nWords1) PCUINT32 pSrc1,
+// _In_reads_(nWords2) PCUINT32 pSrc2,
+// UINT32 nDigits,
+// _Out_writes_(nWords1 + nWords2) PUINT32 pDst )
+
+MUL_FUNCTION_START(SymCryptFdefRawMulMulx1024, 4, 13)
+
+ // First we wipe nDigits of the result (the size of one input)
+ // Q1 = pSrc1
+ // Q2 = pSrc2
+ // Q3 = nDigits
+ // Q4 = pDst
+
+ // Wipe destination for 2 digit blocks
+ xorps xmm0,xmm0 // Zero register for 16-byte wipes
+
+ movaps [Q4],xmm0
+ movaps [Q4+16],xmm0 // Wipe 32 bytes
+ movaps [Q4+32],xmm0 // Wipe 32 bytes
+ movaps [Q4+48],xmm0 // Wipe 32 bytes
+
+ movaps [Q4+64],xmm0
+ movaps [Q4+80],xmm0 // Wipe 32 bytes
+ movaps [Q4+96],xmm0 // Wipe 32 bytes
+ movaps [Q4+112],xmm0 // Wipe 32 bytes
+
+ // Digit 1 from src2
+
+ ZEROREG_8 Q5, Q6, Q7, Q8, Q9, Q10, Q11, Q12 // Leaves Cy = Ov = 0
+
+ MULADD88 Q5, Q6, Q7, Q8, Q9, Q10, Q11, Q12, Q4, Q1, Q2, Q0, Q3, QH
+
+ add Q2, 64 // Src2 ptr
+ add Q4, 64
+ xor Q0, Q0 // sets Cy = Ov = 0
+
+ MULADD88 Q5, Q6, Q7, Q8, Q9, Q10, Q11, Q12, Q4, Q1, Q2, Q0, Q3, QH
+
+ add Q4, 64
+
+ // Write the 8-word carry-out to the destination
+ mov [Q4 + 0*8], Q5
+ mov [Q4 + 1*8], Q6
+ mov [Q4 + 2*8], Q7
+ mov [Q4 + 3*8], Q8
+ mov [Q4 + 4*8], Q9
+ mov [Q4 + 5*8], Q10
+ mov [Q4 + 6*8], Q11
+ mov [Q4 + 7*8], Q12
+
+ // Digit 2 from src2
+
+ // set up
+
+ // Move Q4 one digit back
+ sub Q4, 64
+
+ // reload pSrc2
+ sub Q2, 64
+
+ // update pSrc1
+ add Q1, 64
+
+ ZEROREG_8 Q5, Q6, Q7, Q8, Q9, Q10, Q11, Q12 // Leaves Cy = Ov = 0
+
+ MULADD88 Q5, Q6, Q7, Q8, Q9, Q10, Q11, Q12, Q4, Q1, Q2, Q0, Q3, QH
+
+ add Q2, 64 // Src2 ptr
+ add Q4, 64
+ xor Q0, Q0 // sets Cy = Ov = 0
+
+ MULADD88 Q5, Q6, Q7, Q8, Q9, Q10, Q11, Q12, Q4, Q1, Q2, Q0, Q3, QH
+
+ add Q4, 64
+
+ // Write the 8-word carry-out to the destination
+ mov [Q4 + 0*8], Q5
+ mov [Q4 + 1*8], Q6
+ mov [Q4 + 2*8], Q7
+ mov [Q4 + 3*8], Q8
+ mov [Q4 + 4*8], Q9
+ mov [Q4 + 5*8], Q10
+ mov [Q4 + 6*8], Q11
+ mov [Q4 + 7*8], Q12
+
+MUL_FUNCTION_END(SymCryptFdefRawMulMulx1024)
+
+// VOID
+// SYMCRYPT_CALL
+// SymCryptFdefRawSquareMulx1024(
+// _In_reads_(nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc,
+// UINT32 nDigits,
+// _Out_writes_(2*nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PUINT32 pDst )
+ +MUL_FUNCTION_START(SymCryptFdefRawSquareMulx1024, 3, 13) + + // Wipe 128 bytes of destination + xorps xmm0,xmm0 // Zero register for 16-byte wipes + + movaps [Q3],xmm0 + movaps [Q3+16],xmm0 + movaps [Q3+32],xmm0 + movaps [Q3+48],xmm0 + + movaps [Q3+64],xmm0 + movaps [Q3+80],xmm0 + movaps [Q3+96],xmm0 + movaps [Q3+112],xmm0 + + xor Q0, Q0 // Sets Cy = Ov = 0 + + HALF_SQUARE_NODIAG8 Q5, Q6, Q7, Q8, Q9, Q10, Q11, Q12, Q3, Q1, Q0, Q2, QH + + lea Q4, [Q1 + 64] // Q4 = pSrc + 64 + lea Q3, [Q3 + 64] // Q3 = pDst + 64 + + // Q5, Q6, Q7, Q8, Q9, Q10, Q11, Q12 8-word carry + // Q0, Q2 temps for multiplication + // Q1 pSrc (constant) + // Q4 pSrc + 64 (constant) + // Q3 pDst running pointer + // QH fixed input reg for multiplication + + MULADD88 Q5, Q6, Q7, Q8, Q9, Q10, Q11, Q12, Q3, Q1, Q4, Q0, Q2, QH + + add Q3, 64 // Q3 = pDst + 128 + + // Write the 8-word carry-out to the destination + mov [Q3 + 0*8], Q5 + mov [Q3 + 1*8], Q6 + mov [Q3 + 2*8], Q7 + mov [Q3 + 3*8], Q8 + mov [Q3 + 4*8], Q9 + mov [Q3 + 5*8], Q10 + mov [Q3 + 6*8], Q11 + mov [Q3 + 7*8], Q12 + + // Q3 which is the destination pointer is shifted here by 2 digits + + xor Q0, Q0 // Sets Cy = Ov = 0 + + HALF_SQUARE_NODIAG8 Q5, Q6, Q7, Q8, Q9, Q10, Q11, Q12, Q3, Q4, Q0, Q2, QH + + // Write the 8-word carry-out to the destination + mov [Q3 + 8*8], Q5 + mov [Q3 + 9*8], Q6 + mov [Q3 + 10*8], Q7 + mov [Q3 + 11*8], Q8 + mov [Q3 + 12*8], Q9 + mov [Q3 + 13*8], Q10 + mov [Q3 + 14*8], Q11 + mov [Q3 + 15*8], Q12 + + // Compute diagonals, and add double the result so far + + sub Q3, 128 // Q3 = pDst - sets Cy = Ov = 0 + + SYMCRYPT_SQUARE_DIAG 0, Q1, Q3, Q0, Q2, Q4, Q5, QH + SYMCRYPT_SQUARE_DIAG 1, Q1, Q3, Q0, Q2, Q4, Q5, QH + SYMCRYPT_SQUARE_DIAG 2, Q1, Q3, Q0, Q2, Q4, Q5, QH + SYMCRYPT_SQUARE_DIAG 3, Q1, Q3, Q0, Q2, Q4, Q5, QH + SYMCRYPT_SQUARE_DIAG 4, Q1, Q3, Q0, Q2, Q4, Q5, QH + SYMCRYPT_SQUARE_DIAG 5, Q1, Q3, Q0, Q2, Q4, Q5, QH + SYMCRYPT_SQUARE_DIAG 6, Q1, Q3, Q0, Q2, Q4, Q5, QH + SYMCRYPT_SQUARE_DIAG 7, Q1, Q3, Q0, Q2, Q4, Q5, QH + + SYMCRYPT_SQUARE_DIAG 8, Q1, Q3, Q0, Q2, Q4, Q5, QH + SYMCRYPT_SQUARE_DIAG 9, Q1, Q3, Q0, Q2, Q4, Q5, QH + SYMCRYPT_SQUARE_DIAG 10, Q1, Q3, Q0, Q2, Q4, Q5, QH + SYMCRYPT_SQUARE_DIAG 11, Q1, Q3, Q0, Q2, Q4, Q5, QH + SYMCRYPT_SQUARE_DIAG 12, Q1, Q3, Q0, Q2, Q4, Q5, QH + SYMCRYPT_SQUARE_DIAG 13, Q1, Q3, Q0, Q2, Q4, Q5, QH + SYMCRYPT_SQUARE_DIAG 14, Q1, Q3, Q0, Q2, Q4, Q5, QH + SYMCRYPT_SQUARE_DIAG 15, Q1, Q3, Q0, Q2, Q4, Q5, QH + +MUL_FUNCTION_END(SymCryptFdefRawSquareMulx1024) + +// VOID +// SYMCRYPT_CALL +// SymCryptFdefMontgomeryReduceMulx1024( +// _In_ PCSYMCRYPT_MODULUS pmMod, +// _Inout_ PUINT32 pSrc, +// _Out_ PUINT32 pDst ) +MUL_FUNCTION_START(SymCryptFdefMontgomeryReduceMulx1024, 3, 14) + + mov [rsp + GET_MEMSLOT_OFFSET(slot0)], Q3 + + mov D0, 2 + mov [rsp + GET_MEMSLOT_OFFSET(slot1)], D0 + // CntOuter = nDigits - using first half of slot3 + + xor D4, D4 + lea Q1, [Q1 + SymCryptModulusValueOffsetAmd64] // modulus value + +SymCryptFdefMontgomeryReduceMulx1024OuterLoop: + // Q1 = pmMod + // Q2 = pSrc = tmp buffer that we will reduce + mov Q6, [Q2 + 0 * 8] + mov Q7, [Q2 + 1 * 8] + mov Q8, [Q2 + 2 * 8] + mov Q9, [Q2 + 3 * 8] + mov Q10, [Q2 + 4 * 8] + mov Q11, [Q2 + 5 * 8] + mov Q12, [Q2 + 6 * 8] + mov Q13, [Q2 + 7 * 8] + + mov Q3, [Q1 - SymCryptModulusValueOffsetAmd64 + SymCryptModulusMontgomeryInv64OffsetAmd64] // inv64 + + // Q2 = value to reduce + // Q6 - Q13 = Q2[0..7] + // Q1 = modulus value + // Q3 = modinv + + MONTGOMERY18 Q6, Q7, Q8, Q9, Q10, Q11, Q12, Q13, Q3, Q1, Q2 + (0 * 8), Q0, Q5, QH + MONTGOMERY18 
Q7, Q8, Q9, Q10, Q11, Q12, Q13, Q6, Q3, Q1, Q2 + (1 * 8), Q0, Q5, QH + MONTGOMERY18 Q8, Q9, Q10, Q11, Q12, Q13, Q6, Q7, Q3, Q1, Q2 + (2 * 8), Q0, Q5, QH + MONTGOMERY18 Q9, Q10, Q11, Q12, Q13, Q6, Q7, Q8, Q3, Q1, Q2 + (3 * 8), Q0, Q5, QH + MONTGOMERY18 Q10, Q11, Q12, Q13, Q6, Q7, Q8, Q9, Q3, Q1, Q2 + (4 * 8), Q0, Q5, QH + MONTGOMERY18 Q11, Q12, Q13, Q6, Q7, Q8, Q9, Q10, Q3, Q1, Q2 + (5 * 8), Q0, Q5, QH + MONTGOMERY18 Q12, Q13, Q6, Q7, Q8, Q9, Q10, Q11, Q3, Q1, Q2 + (6 * 8), Q0, Q5, QH + MONTGOMERY18 Q13, Q6, Q7, Q8, Q9, Q10, Q11, Q12, Q3, Q1, Q2 + (7 * 8), Q0, Q5, QH + + // Q6 - Q13 = carry from multiply-add + // Q2[0..7] = Montgomery factors + + mov Q3, Q2 // factor to multiply by + add Q1, 64 + add Q2, 64 + + // Q6, Q7, Q8, Q9, Q10, Q11, Q12, Q13 8-word carry + // Q0, Q5 temps for multiplication + // Q1 running pointer pMod inner loop + // Q2 running pointer pSrc inner loop + // Q3 Montgomery factors for this row + // D4 loop ctr + // QH fixed input reg for multiplication + + MULADD88 Q6, Q7, Q8, Q9, Q10, Q11, Q12, Q13, Q2, Q1, Q3, Q0, Q5, QH + // pre & post: Cy = Ov = 0 + // Q13..Q6:Q2[7-0] = R[7-0]:D[7-0] = A[7:0] * B[7:0] + R[7:0] + D[7:0] + // QH is volatile + + add Q1, 64 + add Q2, 64 + + // We have an 8-word carry here, which we need to add to the in-memory buffer and retain a carry + // We also saved a 1-bit carry from the previous outer loop in D4 + // move carry into Cy flag + neg D4 + mov D4, 0 + + // We do this in separate instructions to help the instruction decoder build up a lead... + mov Q0, [Q2 + 0 * 8] + adc Q0, Q6 + mov [Q2 + 0 * 8], Q0 + + mov Q5, [Q2 + 1 * 8] + adc Q5, Q7 + mov [Q2 + 1 * 8], Q5 + + mov Q0, [Q2 + 2 * 8] + adc Q0, Q8 + mov [Q2 + 2 * 8], Q0 + + mov Q5, [Q2 + 3 * 8] + adc Q5, Q9 + mov [Q2 + 3 * 8], Q5 + + mov Q0, [Q2 + 4 * 8] + adc Q0, Q10 + mov [Q2 + 4 * 8], Q0 + + mov Q5, [Q2 + 5 * 8] + adc Q5, Q11 + mov [Q2 + 5 * 8], Q5 + + mov Q0, [Q2 + 6 * 8] + adc Q0, Q12 + mov [Q2 + 6 * 8], Q0 + + mov Q5, [Q2 + 7 * 8] + adc Q5, Q13 + mov [Q2 + 7 * 8], Q5 + + adc D4, D4 // D4 = carry (D4 was previously zero) + + sub Q2, 64 // Q2 = tmp buffer that we will reduce (64B are now zeroed) + sub Q1, 128 // Q1 = modulus value + + mov D0, [rsp + GET_MEMSLOT_OFFSET(slot1)] + sub D0, 1 + mov [rsp + GET_MEMSLOT_OFFSET(slot1)], D0 + + jnz SymCryptFdefMontgomeryReduceMulx1024OuterLoop + + // D4 = output carry + + mov Q3, [rsp + GET_MEMSLOT_OFFSET(slot0)] + + // Q2 = result buffer pointer + // Q1 = modulus value + // Q3 = Dst + + // pDst = Reduction result - Modulus + + mov Q0,[Q2 + 0 * 8] + sbb Q0,[Q1 + 0 * 8] + mov [Q3 + 0 * 8], Q0 + + mov Q5,[Q2 + 1 * 8] + sbb Q5,[Q1 + 1 * 8] + mov [Q3 + 1 * 8], Q5 + + mov Q0,[Q2 + 2 * 8] + sbb Q0,[Q1 + 2 * 8] + mov [Q3 + 2 * 8], Q0 + + mov Q5,[Q2 + 3 * 8] + sbb Q5,[Q1 + 3 * 8] + mov [Q3 + 3 * 8], Q5 + + mov Q0,[Q2 + 4 * 8] + sbb Q0,[Q1 + 4 * 8] + mov [Q3 + 4 * 8], Q0 + + mov Q5,[Q2 + 5 * 8] + sbb Q5,[Q1 + 5 * 8] + mov [Q3 + 5 * 8], Q5 + + mov Q0,[Q2 + 6 * 8] + sbb Q0,[Q1 + 6 * 8] + mov [Q3 + 6 * 8], Q0 + + mov Q5,[Q2 + 7 * 8] + sbb Q5,[Q1 + 7 * 8] + mov [Q3 + 7 * 8], Q5 + + mov Q0,[Q2 + 8 * 8] + sbb Q0,[Q1 + 8 * 8] + mov [Q3 + 8 * 8], Q0 + + mov Q5,[Q2 + 9 * 8] + sbb Q5,[Q1 + 9 * 8] + mov [Q3 + 9 * 8], Q5 + + mov Q0,[Q2 + 10 * 8] + sbb Q0,[Q1 + 10 * 8] + mov [Q3 + 10 * 8], Q0 + + mov Q5,[Q2 + 11 * 8] + sbb Q5,[Q1 + 11 * 8] + mov [Q3 + 11 * 8], Q5 + + mov Q0,[Q2 + 12 * 8] + sbb Q0,[Q1 + 12 * 8] + mov [Q3 + 12 * 8], Q0 + + mov Q5,[Q2 + 13 * 8] + sbb Q5,[Q1 + 13 * 8] + mov [Q3 + 13 * 8], Q5 + + mov Q0,[Q2 + 14 * 8] + sbb Q0,[Q1 + 14 * 8] + mov [Q3 
+ 14 * 8], Q0
+
+ mov Q5,[Q2 + 15 * 8]
+ sbb Q5,[Q1 + 15 * 8]
+ mov [Q3 + 15 * 8], Q5
+
+ // now a masked copy from the reduction buffer to the destination.
+ // copy if high carry = 0 and Cy = 1
+ sbb D4, 0
+ // D4 = copy mask, ff...ff if copy, 0 if no copy
+
+ movd xmm0, D4 // xmm0[0] = mask
+ pcmpeqd xmm1, xmm1 // xmm1 = ff...ff
+ pshufd xmm0, xmm0, 0 // xmm0[0..3] = mask
+ pxor xmm1, xmm0 // xmm1 = not Mask
+
+
+ movdqa xmm2, [Q2 + 0 * 16] // xmm2 = pSrc[0]
+ movdqa xmm3, [Q3 + 0 * 16] // xmm3 = pDst[0]
+ pand xmm2, xmm0
+ pand xmm3, xmm1
+ por xmm2, xmm3
+ movdqa [Q3 + 0 * 16], xmm2
+
+ movdqa xmm2, [Q2 + 1 * 16] // xmm2 = pSrc[1]
+ movdqa xmm3, [Q3 + 1 * 16] // xmm3 = pDst[1]
+ pand xmm2, xmm0
+ pand xmm3, xmm1
+ por xmm2, xmm3
+ movdqa [Q3 + 1 * 16], xmm2
+
+ movdqa xmm2, [Q2 + 2 * 16] // xmm2 = pSrc[2]
+ movdqa xmm3, [Q3 + 2 * 16] // xmm3 = pDst[2]
+ pand xmm2, xmm0
+ pand xmm3, xmm1
+ por xmm2, xmm3
+ movdqa [Q3 + 2 * 16], xmm2
+
+ movdqa xmm2, [Q2 + 3 * 16] // xmm2 = pSrc[3]
+ movdqa xmm3, [Q3 + 3 * 16] // xmm3 = pDst[3]
+ pand xmm2, xmm0
+ pand xmm3, xmm1
+ por xmm2, xmm3
+ movdqa [Q3 + 3 * 16], xmm2
+
+ movdqa xmm2, [Q2 + 4 * 16] // xmm2 = pSrc[4]
+ movdqa xmm3, [Q3 + 4 * 16] // xmm3 = pDst[4]
+ pand xmm2, xmm0
+ pand xmm3, xmm1
+ por xmm2, xmm3
+ movdqa [Q3 + 4 * 16], xmm2
+
+ movdqa xmm2, [Q2 + 5 * 16] // xmm2 = pSrc[5]
+ movdqa xmm3, [Q3 + 5 * 16] // xmm3 = pDst[5]
+ pand xmm2, xmm0
+ pand xmm3, xmm1
+ por xmm2, xmm3
+ movdqa [Q3 + 5 * 16], xmm2
+
+ movdqa xmm2, [Q2 + 6 * 16] // xmm2 = pSrc[6]
+ movdqa xmm3, [Q3 + 6 * 16] // xmm3 = pDst[6]
+ pand xmm2, xmm0
+ pand xmm3, xmm1
+ por xmm2, xmm3
+ movdqa [Q3 + 6 * 16], xmm2
+
+ movdqa xmm2, [Q2 + 7 * 16] // xmm2 = pSrc[7]
+ movdqa xmm3, [Q3 + 7 * 16] // xmm3 = pDst[7]
+ pand xmm2, xmm0
+ pand xmm3, xmm1
+ por xmm2, xmm3
+ movdqa [Q3 + 7 * 16], xmm2
+
+MUL_FUNCTION_END(SymCryptFdefMontgomeryReduceMulx1024)
+
+FILE_END()
diff --git a/lib/amd64/sha1asm.asm b/lib/amd64/sha1asm.asm
deleted file mode 100644
index 526c3c1..0000000
--- a/lib/amd64/sha1asm.asm
+++ /dev/null
@@ -1,423 +0,0 @@
-;
-; Sha1Asm.Asm
-;
-; Copyright (c) Microsoft Corporation. Licensed under the MIT license.
-;
-;
-
-;
-; This module implements the bulk processing of the FIPS 180-1 SHA message digest algorithm.
-; for the x64 processor architecture.
-;
-; This implementation is derived from the 32-bit one, which in turn is derived
-; from an older one by Scott Field and Dan Shumow.
-;
-
-include ksamd64.inc
-
- TITLE sha1asm.asm
-
- ;
- ; The four round constants used by SHA-1
- ;
-
-K0_19 EQU 05a827999H
-K20_39 EQU 06ed9eba1H
-K40_59 EQU 08f1bbcdcH
-K60_79 EQU 0ca62c1d6H
-
-
-;VOID
-;SYMCRYPT_CALL
-;SymCryptSha1AppendBlocks( _Inout_updates_( 5 ) PUINT32 H,
-; _In_reads_bytes_( cbData ) PCBYTE pbData,
-; SIZE_T cbData )
-;
-
- ;
- ; This function allocates stack space, so it is not a LEAF function
- ; but a nested one.
- ;
- NESTED_ENTRY SymCryptSha1AppendBlocksAsm, _TEXT
-
-;
-; To keep stack manipulations simple we define a structure and use that for all accesses.
-;
-
-SymCryptSha1AppendBlocksFrame struct 16, NONUNIQUE
-;
-; To keep the RSP aligned we need (8 mod 16) bytes of local stack space.
-; this is the case, so there is no need for a dummy location
-;
-Wbuf dd 16 dup (?)
-EndAddress dq ?
-SaveR12 dq ?
-SaveR13 dq ?
-SaveR14 dq ?
-SaveR15 dq ?
-SaveRdi dq ?
-SaveRsi dq ?
-SaveRbp dq ?
-SaveRbx dq ?
-ReturnAddress dq ?
-CallerP1Home dq ?
-CallerP2Home dq ?
-CallerP3Home dq ?
-CallerP4Home dq ?
- -SymCryptSha1AppendBlocksFrame ends - - ; - ; We use the W buffer extensively; this is a shorthand for the base address - ; -W equ rsp+SymCryptSha1AppendBlocksFrame.Wbuf - - - - ; - ; Set up our stack frame and save non-volatile registers - ; - rex_push_reg rbx - push_reg rbp - push_reg rsi - push_reg rdi - push_reg r15 - push_reg r14 - push_reg r13 - push_reg r12 - alloc_stack SymCryptSha1AppendBlocksFrame.SaveR12 - - END_PROLOGUE - - ; - ;Register allocation: - ; - ;5 registers for state - ;2 scratch - ;6 registers for W[t-1], W[t-2], W[t-3], W[t-14], W[t-15], W[t-16] - ;1 for data pointer - ;1 for H pointer - ; - ; - ; To allow macro re-ordering of our registers we use symbolic names - ; for the registers. - ; s0-s4 are the 5 state registers. x1 and x2 are extra scratch registers. - ; w0-w5 contain the W state cache - ; - ; Note: some other code puts the right value in the right register and - ; has to be updated if this mapping is changed. - ; - ; a is in register (round % 5) - ; b is in register (round+4 % 5) - ; c is in register (round+3 % 5) - ; d is in register (round+2 % 5) - ; e is in register (round+1 % 5) - ; This way, if round is incremented we move a->b, b->c, c->d, d->e, and e->a - ; For optimization the actual value of a is in scratch register x1 at the start of each round - ; - ; W[t- 1] is in register (round % 6) - ; W[t- 2] is in register (round+5 % 6) - ; W[t- 3] is in register (round+4 % 6) (is loaded with W[t-13] in each round) - ; W[t-14] is in register (round+3 % 6) - ; W[t-15] is in register (round+2 % 6) - ; W[t-16] is in register (round+1 % 6) - ; If round is incremented the values all appear in their right place. - -s0 EQU eax -s1 EQU ebx -s2 EQU ecx -s3 EQU edx -s4 EQU esi - -w0 EQU r9d -w1 EQU r10d -w2 EQU r11d -w3 EQU r12d -w4 EQU r13d -w5 EQU r14d - -x1 EQU ebp ; screatch 1 -x2 EQU edi ; scratch 2 - -dataPtr EQU r8 ; Points to data buffer -HPtr EQU r15 ; Points to H - - - ; At this point: - ; rcx = H - ; rdx = pbData - ; r8 = cbData - ; - ; compute the end address, address of byte after last block we will process - ; This code ensures that we never exceed the data buffer we were given, - ; although we silently round the cbData parameter down to the next - ; multiple of 64. - ; Do nothing if no blocks need to be processed. - ; - and r8,NOT 3fh ; round down to multiple of 64 - jz SymCryptSha1AppendBlocksDone - add r8,rdx ; pbData + (cbData & 0x3f) - mov [rsp+SymCryptSha1AppendBlocksFrame.EndAddress], r8 - - mov dataPtr,rdx - mov Hptr,rcx - - ; - ; Load the H state, note that the a value lives in x1 at the round code boundary - ; - mov x1,[Hptr ] - mov s4,[Hptr+ 4] - mov s3,[Hptr+ 8] - mov s2,[Hptr+12] - mov s1,[Hptr+16] - - -SymCryptSha1AppendBlocksLoop: - ; - ; This is the main loop. We process 64 bytes in each iteration. - ; - ; Most of the code in the loop is generated through macros using parameters to - ; rename the registers. - ; - -ROUND_CH_0_15 MACRO round,sa,sb,sc,sd,se,wt,x1,x2 - ; - ; Code for round 0-15. - ; This code loads data from the data buffer & BSWAPs the data to get it into the - ; right form. - ; - ; Parameters: - ; round round number - ; sa register that will contain the a value - ; sb register that contains the b value - ; sc register that contains the c value - ; sd register that contains the d value - ; se register that contains the e value - ; x1 scratch, contains the a value on entry - ; x2 scratch register. - ; wt register loaded with Wt - ; - ; We use the formula CH(b,c,d) = ((d ^ c) & b) ^ c which uses only one temp register. 
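In C, the selection function and the one-temp rewrite look like this (a minimal sketch, not SymCrypt source; note that the final xor is with d, as the round macros below spell out):

```c
#include <stdint.h>

// Ch(b,c,d) = (b & c) ^ (~b & d), rewritten as ((d ^ c) & b) ^ d so that
// the whole evaluation needs only a single scratch register.
static uint32_t sha1_ch( uint32_t b, uint32_t c, uint32_t d )
{
    uint32_t t = d ^ c;     // xor
    t &= b;                 // and
    return t ^ d;           // xor
}
```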
- ; We start with the d value as that is the oldest value and available the first - ; - ; See FIPS 180-2 for our symbolic notation. - ; - mov x2,sd ; x2 = d - mov wt,[dataPtr+4*round] ; Fetch word from message - mov sa, x1 ; put a in the correct register - - bswap wt ; wt = Wt - xor x2,sc ; x2 = (d ^ c) - rol x1,5 ; x1 = ROL(a,5) - - add se,wt ; se = e + Wt - and x2,sb ; x2 = ((d ^ c) & b) - mov [W + 4*round],wt ; Store in W buffer for future use - ror sb,2 ; sb = ROL( b, 30 ) - - add se,x1 ; se = e + Wt + ROL(a,5) - xor x2,sd ; x2 = ((d ^ c) & b) ^ d = CH(b,c,d) - - lea x1,[se+x2+K0_19] ; x1 = e + Wt + ROL(a,5) + Ch(b,c,d) + Kt - - ENDM - -MSG_EXP MACRO round, se, wa, wb, wc - ; round round number - ; se register of state to add expanded message word to - ; wa register of W[round-16], will be updated to contain W[round] - ; wb register of W[round-14] - ; wc register of W[round- 3], will be loaded with W[round-13] - - xor wc, wb ; wc = W[t-3] ^ W[t-14] - xor wa,[W+4*((round-8) MOD 16)] ; wa = W[t-16] ^ W[t-8] - xor wa, wc ; wa = W[t-16] ^ W[t-14] ^ W[t-8] ^ W[t-3] - rol wa,1 ; wa = Wt - IF round LT (80 - 1) - ; do not load wc with W[t-13] in the last round; it will not be needed - mov wc,[W+4*((round-13) MOD 16)] ; wc = W[t-13] - ENDIF - add se,wa ; re = e + Wt - IF round LT (80 - 8) - ; don't store Wt in the last 8 rounds. The value would never be used - mov [W+4*(round MOD 16)], wa; Store Wt - ENDIF - ENDM - -ROUND_CH MACRO round, sa, sb, sc, sd, se, wa, wb, wc, x1, x2 - ; - ; See ROUND_CH_0_15 for most parameters. - ; x1 and x2 are both scratch registers - ; wa register of W[round-16], will be updated to contain W[round] - ; wb register of W[round-14] - ; wc register of W[round- 3], will be loaded with W[round-13] - ; - - xor wc, wb ; wc = W[t-3] ^ W[t-14] - xor wa,[W+4*((round-8) MOD 16)] ; wa = W[t-16] ^ W[t-8] - xor wa, wc ; wa = W[t-16] ^ W[t-14] ^ W[t-8] ^ W[t-3] - rol wa,1 ; wa = Wt - mov wc,[W+4*((round-13) MOD 16)] ; wc = W[t-13] - add se,wa ; re = e + Wt - mov [W+4*(round MOD 16)], wa ; Store Wt - - mov sa, x1 ; put a in the correct register - mov x2,sd ; x2 = d - rol x1,5 ; x1 = ROL(a,5) - xor x2,sc ; x2 = (d ^ c) - add se,x1 ; re = e + Wt + ROL(a,5) - and x2,sb ; x2 = ((d ^ c) & b) - ror sb,2 ; rb = ROL( b, 30 ) - xor x2,sd ; x2 = ((d ^ c) & b) ^ d = CH(b,c,d) - lea x1,[se+x2+K0_19] ; re = e + Wt + ROL(a,5) + Ch(b,c,d) + Kt - ENDM - -ROUND_PARITY MACRO round, sa, sb, sc, sd, se, wa, wb, wc, x1, x2, K - ; - ; See ROUND_CH for most parameters - ; K is the round constant to use. - ; - ; The order of xorring the registers b, c, and d is driven by the data dependency graph. 
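As a reference for what these macros compute, here is one expanded parity round in C (a minimal sketch, not SymCrypt source; W is the circular 16-word buffer kept on the stack, and the asm gets the a..e rotation for free by renaming registers each round instead of moving data):

```c
#include <stdint.h>

static uint32_t rol32( uint32_t x, int n ) { return (x << n) | (x >> (32 - n)); }

// One round for t in 20..39 or 60..79: MSG_EXP message expansion followed by
// e += ROL(a,5) + Parity(b,c,d) + Wt + K and b = ROL(b,30).
static void sha1_round_parity( uint32_t W[16], int t, uint32_t a, uint32_t *b,
                               uint32_t c, uint32_t d, uint32_t *e, uint32_t K )
{
    uint32_t wt = rol32( W[(t-3) & 15] ^ W[(t-8) & 15] ^
                         W[(t-14) & 15] ^ W[t & 15], 1 );   // W[t-16] sits at t mod 16
    W[t & 15] = wt;                                         // store Wt for later rounds
    *e += rol32( a, 5 ) + (*b ^ c ^ d) + wt + K;
    *b = rol32( *b, 30 );
}
```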
- ; We start with d (the oldest) and then do b to unblock the subsequent rotate - ; - MSG_EXP round, se, wa, wb, wc ; re = e + Wt - - mov sa,x1 ; store a value in right register - rol x1,5 ; x1 = ROL(a,5) - add se,x1 ; re = e + Wt + ROL(a,5) - - mov x2,sd ; x1 = d - xor x2,sb ; x1 = (d ^ b) - xor x2,sc ; x1 = (d ^ b ^ c) = Parity(b,c,d) - ror sb,2 ; rb = ROL( b, 30 ) - lea x1,[se+x2+K] ; re = e + ROL(a,5) + Parity(b,c,d) + Wt + Kt - - ENDM - -ROUND_MAJ MACRO round, sa, sb, sc, sd, se, wa, wb, wc, x1, x2 - ; - ; See above for parameter explanation - ; - MSG_EXP round, se, wa, wb, wc ; re = e + Wt - - mov sa,x1 ; store a value in right register - rol x1,5 ; x1 = ROL(a,5) - add se,x1 ; re = e + ROL(a,5) - mov x1,sd ; x1 = d - or x1,sc ; x1 = (d | c) - and x1,sb ; x1 = ((d | c) & b) - - mov x2,sc ; x2 = c - and x2,sd ; x2 = (c & d) - or x1,x2 ; x1 = ((d | c) & b) | (d & c) = MAJ(b,c,d) - - ror sb,2 ; rb = ROL( b, 30 ) - - lea x1,[se+x1+K40_59] ; re = e + ROL(a,5) + Wt + Maj(b,c,d) + Kt - ENDM - - - ; - ; With these macros we can now produce the actual code. - ; Note the use of the % operator which evaluates the expression and yields the result as text. - ; Together with the macros and the r EQUs this provides us with automatic register renaming - ; for each round. - ; - ; The first 16 rounds are more complicated as we need to use the right registers to load the msg in - ; so we do those by hand - ; - ; W[t- 1] is in register (round % 6) - ; W[t- 2] is in register (round+5 % 6) - ; W[t- 3] is in register (round+4 % 6) (is loaded with W[t-13] in each round) - ; W[t-14] is in register (round+3 % 6) - ; W[t-15] is in register (round+2 % 6) - ; W[t-16] is in register (round+1 % 6) - ; - ROUND_CH_0_15 0, s0, s4, s3, s2, s1, w5, x1, x2 ;W[t-16] for t=16 is in w5 - ROUND_CH_0_15 1, s1, s0, s4, s3, s2, w0, x1, x2 ;W[t-15] for t=16 is in w0 - ROUND_CH_0_15 2, s2, s1, s0, s4, s3, w1, x1, x2 ;W[t-14] for t=16 is in w1 - ROUND_CH_0_15 3, s3, s2, s1, s0, s4, w3, x1, x2 ; - ROUND_CH_0_15 4, s4, s3, s2, s1, s0, w4, x1, x2 ; - ROUND_CH_0_15 5, s0, s4, s3, s2, s1, w3, x1, x2 ; - ROUND_CH_0_15 6, s1, s0, s4, s3, s2, w4, x1, x2 ; - ROUND_CH_0_15 7, s2, s1, s0, s4, s3, w3, x1, x2 ; - ROUND_CH_0_15 8, s3, s2, s1, s0, s4, w4, x1, x2 ; - ROUND_CH_0_15 9, s4, s3, s2, s1, s0, w3, x1, x2 ; - ROUND_CH_0_15 10, s0, s4, s3, s2, s1, w4, x1, x2 ; - ROUND_CH_0_15 11, s1, s0, s4, s3, s2, w3, x1, x2 ; - ROUND_CH_0_15 12, s2, s1, s0, s4, s3, w4, x1, x2 ; - ROUND_CH_0_15 13, s3, s2, s1, s0, s4, w2, x1, x2 ;W[t-3] for t=16 is in w2 - ROUND_CH_0_15 14, s4, s3, s2, s1, s0, w3, x1, x2 ;W[t-2] for t=16 is in w3 - ROUND_CH_0_15 15, s0, s4, s3, s2, s1, w4, x1, x2 ;W[t-1] for t=16 is in w4 - - - FOR t, <16, 17, 18, 19> - ROUND_CH t, s%(t MOD 5), s%((t+4) MOD 5), s%((t+3) MOD 5), s%((t+2) MOD 5), s%((t+1) MOD 5), w%((t+1) MOD 6), w%((t+3) MOD 6), w%((t+4) MOD 6), x1, x2 - ENDM - - FOR t, <20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39> - ROUND_PARITY t, s%(t MOD 5), s%((t+4) MOD 5), s%((t+3) MOD 5), s%((t+2) MOD 5), s%((t+1) MOD 5), w%((t+1) MOD 6), w%((t+3) MOD 6), w%((t+4) MOD 6), x1, x2, K20_39 - ENDM - - FOR t, <40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59> - ROUND_MAJ t, s%(t MOD 5), s%((t+4) MOD 5), s%((t+3) MOD 5), s%((t+2) MOD 5), s%((t+1) MOD 5), w%((t+1) MOD 6), w%((t+3) MOD 6), w%((t+4) MOD 6), x1, x2 - ENDM - - FOR t, <60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79> - ROUND_PARITY t, s%(t MOD 5), s%((t+4) MOD 5), s%((t+3) MOD 5), 
s%((t+2) MOD 5), s%((t+1) MOD 5), w%((t+1) MOD 6), w%((t+3) MOD 6), w%((t+4) MOD 6), x1, x2, K60_79 - ENDM - - ; - ; Now we update the state, & the dataPtr - ; - add x1,[Hptr ] - add s4,[Hptr+ 4] - add dataPtr,64 - add s3,[Hptr+ 8] - add s2,[Hptr+12] - add s1,[Hptr+16] - - mov [Hptr ], x1 - mov [Hptr+ 4], s4 - cmp dataPtr,[rsp+SymCryptSha1AppendBlocksFrame.EndAddress] ; Loop terminating condition - mov [Hptr+ 8], s3 - mov [Hptr+12], s2 - mov [Hptr+16], s1 - - jc SymCryptSha1AppendBlocksLoop ; Main loop - - ; - ; We're done processing the blocks. The result is already in the state, so all we have to do - ; is clean up. - ; - ; Wipe the W buffer - ; The @@: label is an anonymous label. You can refer to the previous one using @B, which is easy to read. - ; - mov rcx,64 - xor rax,rax -@@: sub ecx,16 - mov [rsp+rcx ],rax - mov [rsp+rcx+8],rax - jnz @B - -SymCryptSha1AppendBlocksDone: - - - add rsp, SymCryptSha1AppendBlocksFrame.SaveR12 - - BEGIN_EPILOGUE - pop r12 - pop r13 - pop r14 - pop r15 - pop rdi - pop rsi - pop rbp - pop rbx - - ret - - NESTED_END SymCryptSha1AppendBlocksAsm, _TEXT - -END - diff --git a/lib/amd64/symcrypt_magic.inc b/lib/amd64/symcrypt_magic.inc deleted file mode 100644 index 7dbdaaa..0000000 --- a/lib/amd64/symcrypt_magic.inc +++ /dev/null @@ -1,37 +0,0 @@ -; -; SymCrypt_magic.inc -; Copyright (c) Microsoft Corporation. Licensed under the MIT license. -; -; Include file to define the support macros for the Magic field -; - - extern SymCryptFatal:NEAR - - -SYMCRYPT_MAGIC_FIELD MACRO - - if DBG - magic dq ? - endif - - ENDM - -SYMCRYPT_CODE_VERSION EQU ((SYMCRYPT_CODE_VERSION_API SHL 16) OR SYMCRYPT_CODE_VERSION_MINOR ) -SYMCRYPT_MAGIC_CONSTANT EQU ('S1mv' + SYMCRYPT_CODE_VERSION) - -SYMCRYPT_CHECK_MAGIC MACRO ptr, struct_name - - if DBG - - mov rax, [ptr + struct_name.magic] - sub rax, ptr - cmp rax, SYMCRYPT_MAGIC_CONSTANT - jz @F - mov ecx, 'magc' - call SymCryptFatal -@@: - endif - - ENDM - - diff --git a/lib/amd64/wipe.asm b/lib/amd64/wipe.asm deleted file mode 100644 index 19175f0..0000000 --- a/lib/amd64/wipe.asm +++ /dev/null @@ -1,171 +0,0 @@ -; -; Wipe.asm -; -; Copyright (c) Microsoft Corporation. Licensed under the MIT license. -; - -include ksamd64.inc - - TITLE wipe.asm - -;VOID -;SYMCRYPT_CALL -;SymCryptWipe( _Out_writes_bytes_( cbData ) PVOID pbData, -; SIZE_T cbData ) - - ; - ; This function allocates no stack space, calls no functions, and does not save - ; any non-volatile registers. Thusm it is a LEAF function - ; - LEAF_ENTRY SymCryptWipeAsm, _TEXT - - ; rcx = pbData - ; rdx = cbData - - ; - ; This function will handle any alignment of pbData and any size, but it is optimized for - ; the case where the start and end of the buffer are 16-aligned. - ; 16 is the natural stack alignment on AMD64, and structures can be designed to be a multiple - ; of 16 long without adding too much slack. - ; The cost of non-alignment is relatively low, in the order of 5 cycles or so - ; - - xorps xmm0,xmm0 ; Zero register for 16-byte wipes - cmp rdx,16 - jb SymCryptWipeAsmSmall ; if cbData < 16, this is a rare case - - test rcx,15 - jnz SymCryptWipeAsmUnAligned; if data pointer is unaligned, we jump to the code that aligns the pointer - ; For well-optimized callers the aligned case is the common one, and that is - ; the fall-through. 
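As a rough C model of this strategy (illustrative, not SymCrypt source; a real wipe must also keep the compiler from eliminating the stores, a problem the asm version avoids by construction, and the asm wipes 32 bytes per iteration with a branch-free adjustment for an odd 16-byte block):

```c
#include <stdint.h>
#include <string.h>

static void wipe_model( unsigned char *p, size_t cb )
{
    unsigned char *end = p + cb;

    if( cb < 16 )
    {
        while( p < end ) *p++ = 0;          // rare small case, handled separately
        return;
    }
    memset( p, 0, 16 );                     // 16-byte head at any alignment
    p += 16 - ((uintptr_t) p & 15);         // move to a 16-aligned position
    while( end - p >= 16 )
    {
        memset( p, 0, 16 );                 // aligned 16-byte wipes
        p += 16;
    }
    memset( end - 16, 0, 16 );              // overlapping 16-byte tail
}
```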
- -SymCryptWipeAsmAligned: - ; - ; Here rcx is aligned, and rdx contains the # bytes left to wipe, and rdx >= 16 - ; - ; Our loop wipes in 32-byte increments; we always wipe the first 16 bytes if - ; and increment the pbData pointer if cbData is 16 mod 32 - ; This avoids a conditional jump and is faster. - ; - test rdx,16 - movaps [rcx],xmm0 ; it is safe to always wipe as cbData >= 16 - lea r8,[rcx+16] - cmovnz rcx,r8 ; only increment pbData if cbData = 16 mod 32 - - sub rdx,32 ; see if we have >= 32 bytes to wipe - jc SymCryptWipeAsmTailOptional ; if not, wipe tail, or nothing if cbData = 0 mod 16 - - align 16 - -SymCryptWipeAsmLoop: - movaps [rcx],xmm0 - movaps [rcx+16],xmm0 ; Wipe 32 bytes - add rcx,32 - sub rdx,32 - jnc SymCryptWipeAsmLoop - -SymCryptWipeAsmTailOptional: - ; only the lower 4 bits of rdx are valid, we have subtracted too much already. - ; The wipe was at least 16 bytes, so we can just wipe the tail in one instruction - - and edx,15 - jnz SymCryptWipeAsmTail - ret - -SymCryptWipeAsmTail: - ; This code appears also below at the end of the unaligned wiping routine - ; but making the jnz jump further is slower and we only duplicate 4 instructions. - xor eax,eax - mov [rcx+rdx-16],rax - mov [rcx+rdx-8],rax - ret - - align 4 -SymCryptWipeAsmUnaligned: - - ; - ; At this point we know that cbData(rdx) >= 16 and pbData(rcx) is unaligned. - ; We can wipe 16 bytes and move to an aligned position - ; - xor eax,eax - mov [rcx],rax - mov [rcx+8],rax - - mov eax,ecx ; - neg eax ; lower 4 bits of eax = # bytes to wipe to reach alignment - and eax,15 - add rcx,rax - sub rdx,rax - - ; - ; If rdx > 16, go to the aligned wiping loop - ; - cmp rdx,16 - jae SymCryptWipeAsmAligned ; if cbData >= 16, do aligned wipes - - ; - ; We have <= 16 bytes to wipe, and we know that the full wipe region was at least 16 bytes. - ; We just wipe the last 16 bytes completely. - ; - xor eax,eax - mov [rcx+rdx-16],rax - mov [rcx+rdx-8],rax - ret - - - align 8 -SymCryptWipeAsmSmall: - ; rcx = pbData, possibly unaligned - ; rdx = cbData; rdx < 16 - ; - ; With speculative execution attacks, the cost of a jump table is prohibitive. - ; We use a compare ladder for 5 cases: - ; 8-15 bytes - ; 4-7 bytes - ; 2-3 bytes - ; 1 byte - ; 0 bytes - - xor eax,eax - - cmp edx, 8 - jb SymCryptWipeAsmSmallLessThan8 - - ; wipe 8-15 bytes using two possibly overlapping writes - mov [rcx], rax - mov [rcx + rdx - 8], rax - ret - -SymCryptWipeAsmSmallLessThan8: - cmp edx, 4 - jb SymCryptWipeAsmSmallLessThan4 - - ; wipe 4-7 bytes - mov [rcx], eax - mov [rcx + rdx - 4], eax - ret - -SymCryptWipeAsmSmallLessThan4: - cmp edx, 2 - jb SymCryptWipeAsmSmallLessThan2 - - ; wipe 2-3 bytes - mov [rcx], ax - mov [rcx + rdx - 2], ax - ret - -SymCryptWipeAsmSmallLessThan2: - or edx, edx - jz SymCryptWipeAsmSmallDone - - ; wipe 1 byte - mov [rcx], al - -SymCryptWipeAsmSmallDone: - - ret - - LEAF_END SymCryptWipeAsm, _TEXT - -END - diff --git a/lib/amd64/wipe.symcryptasm b/lib/amd64/wipe.symcryptasm new file mode 100644 index 0000000..22225b0 --- /dev/null +++ b/lib/amd64/wipe.symcryptasm @@ -0,0 +1,165 @@ +// +// wipe.symcryptasm Assembler code for wiping a buffer +// Expresses asm in a generic enough way to enable generation of MASM and GAS using the +// symcryptasm_processor.py script and C preprocessor +// +// Copyright (c) Microsoft Corporation. Licensed under the MIT license. 
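+//
+// For example, with the register mappings in scripts/symcryptasm_processor.py, the abstract
+// argument registers of FUNCTION_START(SymCryptWipeAsm, 2, 4) below resolve to Q1 = rcx and
+// Q2 = rdx under the MSFT x64 calling convention, and to Q1 = rdi and Q2 = rsi under the
+// SystemV calling convention, so a single source serves both environments.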
+ + +#include "symcryptasm_shared.cppasm" + +//VOID +//SYMCRYPT_CALL +//SymCryptWipe( _Out_writes_bytes_( cbData ) PVOID pbData, +// SIZE_T cbData ) + +FUNCTION_START(SymCryptWipeAsm, 2, 4) + + // Q1 = pbData + // Q2 = cbData + + // + // This function will handle any alignment of pbData and any size, but it is optimized for + // the case where the start and end of the buffer are 16-aligned. + // 16 is the natural stack alignment on AMD64, and structures can be designed to be a multiple + // of 16 long without adding too much slack. + // The cost of non-alignment is relatively low, in the order of 5 cycles or so + // + + xorps xmm0,xmm0 // Zero register for 16-byte wipes + cmp Q2,16 + jb SymCryptWipeAsmSmall // if cbData < 16, this is a rare case + + test Q1,15 + jnz SymCryptWipeAsmUnaligned // if data pointer is unaligned, we jump to the code that aligns the pointer + // For well-optimized callers the aligned case is the common one, and that is + // the fall-through. + +SymCryptWipeAsmAligned: + // + // Here Q1 is aligned, and Q2 contains the # bytes left to wipe, and Q2 >= 16 + // + // Our loop wipes in 32-byte increments; we always wipe the first 16 bytes + // and increment the pbData pointer if cbData is 16 mod 32 + // This avoids a conditional jump and is faster. + // + test Q2,16 + movaps [Q1],xmm0 // it is safe to always wipe as cbData >= 16 + lea Q3,[Q1+16] + cmovnz Q1,Q3 // only increment pbData if cbData = 16 mod 32 + + sub Q2,32 // see if we have >= 32 bytes to wipe + jc SymCryptWipeAsmTailOptional // if not, wipe tail, or nothing if cbData = 0 mod 16 + +ALIGN(16) + +SymCryptWipeAsmLoop: + movaps [Q1],xmm0 + movaps [Q1+16],xmm0 // Wipe 32 bytes + add Q1,32 + sub Q2,32 + jnc SymCryptWipeAsmLoop + +SymCryptWipeAsmTailOptional: + // only the lower 4 bits of Q2 are valid, we have subtracted too much already. + // The wipe was at least 16 bytes, so we can just wipe the tail with 2 instructions + + and D2,15 + jnz SymCryptWipeAsmTail + ret + +SymCryptWipeAsmTail: + // This code appears also below at the end of the unaligned wiping routine + // but making the jnz jump further is slower and we only duplicate 4 instructions. + xor D0,D0 + mov [Q1+Q2-16],Q0 + mov [Q1+Q2-8],Q0 + ret + +ALIGN(4) + +SymCryptWipeAsmUnaligned: + + // + // At this point we know that cbData(Q2) >= 16 and pbData(Q1) is unaligned. + // We can wipe 16 bytes and move to an aligned position + // + xor D0,D0 + mov [Q1],Q0 + mov [Q1+8],Q0 + + mov D0,D1 + neg D0 // lower 4 bits of D0 = # bytes to wipe to reach alignment + and D0,15 + add Q1,Q0 + sub Q2,Q0 + + // + // If Q2 > 16, go to the aligned wiping loop + // + cmp Q2,16 + jae SymCryptWipeAsmAligned // if cbData >= 16, do aligned wipes + + // + // We have <= 16 bytes to wipe, and we know that the full wipe region was at least 16 bytes. + // We just wipe the last 16 bytes completely. + // + xor D0,D0 + mov [Q1+Q2-16],Q0 + mov [Q1+Q2-8],Q0 + ret + +ALIGN(8) + +SymCryptWipeAsmSmall: + // Q1 = pbData, possibly unaligned + // Q2 = cbData; Q2 < 16 + // + // With speculative execution attacks, the cost of a jump table is prohibitive. 
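+    // (A jump table is reached through an indirect branch; with Spectre-style attacks, indirect
+    // branches have to be mitigated, which makes them expensive. A short compare ladder avoids
+    // the issue entirely.)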
+ // We use a compare ladder for 5 cases: + // 8-15 bytes + // 4-7 bytes + // 2-3 bytes + // 1 byte + // 0 bytes + + xor D0,D0 + + cmp D2, 8 + jb SymCryptWipeAsmSmallLessThan8 + + // wipe 8-15 bytes using two possibly overlapping writes + mov [Q1],Q0 + mov [Q1+Q2-8],Q0 + ret + +SymCryptWipeAsmSmallLessThan8: + cmp D2, 4 + jb SymCryptWipeAsmSmallLessThan4 + + // wipe 4-7 bytes + mov [Q1],D0 + mov [Q1+Q2-4],D0 + ret + +SymCryptWipeAsmSmallLessThan4: + cmp D2, 2 + jb SymCryptWipeAsmSmallLessThan2 + + // wipe 2-3 bytes + mov [Q1],W0 + mov [Q1+Q2-2],W0 + ret + +SymCryptWipeAsmSmallLessThan2: + or D2,D2 + jz SymCryptWipeAsmSmallDone + + // wipe 1 byte + mov [Q1],B0 + +SymCryptWipeAsmSmallDone: + +FUNCTION_END(SymCryptWipeAsm) + +FILE_END() diff --git a/lib/arm/fdef_asm.asm b/lib/arm/fdef_asm.asm index 0d6e459..e514928 100644 --- a/lib/arm/fdef_asm.asm +++ b/lib/arm/fdef_asm.asm @@ -9,7 +9,11 @@ #include "symcrypt_version.inc" #include "symcrypt_magic.inc" +; As Arm assembler already uses C preprocessor, we can just hardcode this asm to include constants +; MASM for now. To be fixed properly when converting arm64 asm to symcryptasm. +#define SYMCRYPT_MASM #include "C_asm_shared.inc" +#undef SYMCRYPT_MASM ; A digit consists of 4 words of 32 bits each @@ -449,11 +453,11 @@ SymCryptFdefRawSquareAsmInnerLoopInit_Word1 SQR_SINGLEADD_32 3 - + add r2, r2, #16 add r4, r4, #16 - adds r3, r3, #1 ; move one digit up + adds r3, r3, #1 ; move one digit up bne SymCryptFdefRawSquareAsmInnerLoopInit_Word0 str r11, [r4] ; Store the next word into the destination @@ -689,7 +693,7 @@ SymCryptFdefMontgomeryReduceAsmInner adds r11, r11, r7 ; c + pSrc[nWords] + hc adc r8, r8, #0 ; Add the carry if any str r11, [r1], #4 ; pSrc[nWords] = c - + adds r12, r12, r6 ; c + pSrc[nWords+1] adc r9, r9, #0 ; Add the carry if any adds r12, r12, r8 ; c + pSrc[nWords] + hc @@ -701,7 +705,7 @@ SymCryptFdefMontgomeryReduceAsmInner add r2, r2, #8 ; Move stored pSrc pointer two words up ldr r0, [sp, #pMod] ; Restore the pMod pointer mov r1, r2 ; Restore the pSrc pointer - + bne SymCryptFdefMontgomeryReduceAsmOuter ; diff --git a/lib/arm64/fdef369_asm.asm b/lib/arm64/fdef369_asm.asm index bb75673..11d4985 100644 --- a/lib/arm64/fdef369_asm.asm +++ b/lib/arm64/fdef369_asm.asm @@ -16,7 +16,11 @@ #include "symcrypt_name_mangling.inc" #include "symcrypt_magic.inc" +; As Arm assembler already uses C preprocessor, we can just hardcode this asm to include constants +; MASM for now. To be fixed properly when converting arm64 asm to symcryptasm. +#define SYMCRYPT_MASM #include "C_asm_shared.inc" +#undef SYMCRYPT_MASM ; A digit consists of 3 words of 64 bits each @@ -213,7 +217,7 @@ SymCryptFdef369RawMulAsmLoopInner1 adcs x12, x12, x15 ; Adding the previous word (if there was a carry from the last addition it is added) umulh x15, x6, x8 ; Bits <127:64> of pSrc1[0]*pSrc2[j+2] str x12, [x4], #8 ; Store to destination - + cbnz x3, SymCryptFdef369RawMulAsmLoopInner1 adc x15, x15, XZR ; Store the next word into the destination (with the carry if any) diff --git a/lib/arm64/fdef_asm.asm b/lib/arm64/fdef_asm.asm index 5dcd9c4..8897ba9 100644 --- a/lib/arm64/fdef_asm.asm +++ b/lib/arm64/fdef_asm.asm @@ -10,7 +10,11 @@ #include "symcrypt_name_mangling.inc" #include "symcrypt_magic.inc" +; As Arm assembler already uses C preprocessor, we can just hardcode this asm to include constants +; MASM for now. To be fixed properly when converting arm64 asm to symcryptasm. 
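+; (The SYMCRYPT_MASM define below presumably selects the MASM flavour of the constant
+; definitions in C_asm_shared.inc, as opposed to its C or GAS flavours.)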
+#define SYMCRYPT_MASM #include "C_asm_shared.inc" +#undef SYMCRYPT_MASM ; A digit consists of 4 words of 64 bits each diff --git a/lib/fdef_general.c b/lib/fdef_general.c index 4c4383a..0a79476 100644 --- a/lib/fdef_general.c +++ b/lib/fdef_general.c @@ -517,11 +517,11 @@ SymCryptFdefIntSetValueUint64( SYMCRYPT_ERROR SYMCRYPT_CALL SymCryptFdefRawSetValue( - _In_reads_bytes_(cbSrc) PCBYTE pbSrc, - SIZE_T cbSrc, - SYMCRYPT_NUMBER_FORMAT format, - _Out_writes_(nWords) PUINT32 pDst, - UINT32 nDigits ) + _In_reads_bytes_(cbSrc) PCBYTE pbSrc, + SIZE_T cbSrc, + SYMCRYPT_NUMBER_FORMAT format, + _Out_writes_(nDigits * SYMCRYPT_FDEF_DIGIT_NUINT32) PUINT32 pDst, + UINT32 nDigits ) { SYMCRYPT_ERROR scError; UINT32 b; @@ -611,11 +611,11 @@ SymCryptFdefIntSetValue( SYMCRYPT_ERROR SYMCRYPT_CALL SymCryptFdefRawGetValue( - _In_reads_(nWords) PCUINT32 pSrc, - UINT32 nDigits, - _Out_writes_bytes_(cbBytes) PBYTE pbDst, - SIZE_T cbDst, - SYMCRYPT_NUMBER_FORMAT format ) + _In_reads_(nDigits * SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc, + UINT32 nDigits, + _Out_writes_bytes_(cbBytes) PBYTE pbDst, + SIZE_T cbDst, + SYMCRYPT_NUMBER_FORMAT format ) { SYMCRYPT_ERROR scError; UINT32 b; diff --git a/lib/fdef_int.c b/lib/fdef_int.c index dd5d636..7fbbb23 100644 --- a/lib/fdef_int.c +++ b/lib/fdef_int.c @@ -722,11 +722,11 @@ SymCryptFdefIntSquare( VOID SYMCRYPT_CALL SymCryptFdefRawMulC( - _In_reads_(nWords1) PCUINT32 pSrc1, - UINT32 nDigits1, - _In_reads_(nWords2) PCUINT32 pSrc2, - UINT32 nDigits2, - _Out_writes_(nWords1 + nWords2) PUINT32 pDst ) + _In_reads_(nDigits1 * SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc1, + UINT32 nDigits1, + _In_reads_(nDigits2 * SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc2, + UINT32 nDigits2, + _Out_writes_((nDigits1+nDigits2)*SYMCRYPT_FDEF_DIGIT_NUINT32) PUINT32 pDst ) { UINT32 nWords1 = nDigits1 * SYMCRYPT_FDEF_DIGIT_NUINT32; UINT32 nWords2 = nDigits2 * SYMCRYPT_FDEF_DIGIT_NUINT32; @@ -778,9 +778,9 @@ SymCryptFdefRawMul( VOID SYMCRYPT_CALL SymCryptFdefRawSquareC( - _In_reads_(nWords) PCUINT32 pSrc, - UINT32 nDigits, - _Out_writes_(2*nWords) PUINT32 pDst ) + _In_reads_(nDigits * SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc, + UINT32 nDigits, + _Out_writes_(2*nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PUINT32 pDst ) { UINT32 nWords = nDigits * SYMCRYPT_FDEF_DIGIT_NUINT32; diff --git a/lib/fdef_mod.c b/lib/fdef_mod.c index 3e3b7f0..07b6e43 100644 --- a/lib/fdef_mod.c +++ b/lib/fdef_mod.c @@ -1223,7 +1223,7 @@ SymCryptFdefModMulMontgomery( SymCryptFdefMontgomeryReduce( pmMod, pTmp, &peDst->d.uint32[0] ); } -#if SYMCRYPT_CPU_AMD64 && SYMCRYPT_MS_VC +#if SYMCRYPT_CPU_AMD64 VOID SYMCRYPT_CALL SymCryptFdefModMulMontgomeryMulx( @@ -1283,7 +1283,7 @@ SymCryptFdefModSquareMontgomery( } -#if SYMCRYPT_CPU_AMD64 && SYMCRYPT_MS_VC +#if SYMCRYPT_CPU_AMD64 VOID SYMCRYPT_CALL SymCryptFdefModSquareMontgomeryMulx( @@ -1356,70 +1356,12 @@ SymCryptFdefModInvMontgomery( return scError; } -#if SYMCRYPT_CPU_AMD64 && SYMCRYPT_MS_VC +#if SYMCRYPT_CPU_AMD64 //===================================== // 256-bit Montgomery modulus code // -VOID -SYMCRYPT_CALL -SymCryptFdefModAdd256Test( - _In_ PCSYMCRYPT_MODULUS pmMod, - _In_ PCSYMCRYPT_MODELEMENT peSrc1, - _In_ PCSYMCRYPT_MODELEMENT peSrc2, - _Out_ PSYMCRYPT_MODELEMENT peDst, - _Out_writes_bytes_( cbScratch ) PBYTE pbScratch, - SIZE_T cbScratch ) -{ - SYMCRYPT_ASYM_ALIGN BYTE buf1[128]; - SYMCRYPT_ASYM_ALIGN BYTE buf2[128]; - PSYMCRYPT_MODELEMENT peTmp1 = SymCryptModElementCreate( SYMCRYPT_ASYM_ALIGN_UP( buf1 ), sizeof( buf1 ) - SYMCRYPT_ASYM_ALIGN_VALUE, pmMod ); - 
PSYMCRYPT_MODELEMENT peTmp2 = SymCryptModElementCreate( SYMCRYPT_ASYM_ALIGN_UP( buf2 ), sizeof( buf2 ) - SYMCRYPT_ASYM_ALIGN_VALUE, pmMod ); - - (VOID) peTmp1; - (VOID) peTmp2; - - SymCryptFdefModAdd256Asm( pmMod, peSrc1, peSrc2, peTmp1, pbScratch, cbScratch ); - SymCryptFdefModAddGeneric( pmMod, peSrc1, peSrc2, peTmp2, pbScratch, cbScratch ); - - if( memcmp( peTmp1, peTmp2, 64 ) != 0 ) - { - SymCryptFatal( 42 ); - } - - SymCryptFdefModAdd256Asm( pmMod, peSrc1, peSrc2, peDst, pbScratch, cbScratch ); -} - -VOID -SYMCRYPT_CALL -SymCryptFdefModMulMontgomery256Test( - _In_ PCSYMCRYPT_MODULUS pmMod, - _In_ PCSYMCRYPT_MODELEMENT peSrc1, - _In_ PCSYMCRYPT_MODELEMENT peSrc2, - _Out_ PSYMCRYPT_MODELEMENT peDst, - _Out_writes_bytes_( cbScratch ) PBYTE pbScratch, - SIZE_T cbScratch ) -{ - SYMCRYPT_ASYM_ALIGN BYTE buf1[128]; - SYMCRYPT_ASYM_ALIGN BYTE buf2[128]; - PSYMCRYPT_MODELEMENT peTmp1 = SymCryptModElementCreate( SYMCRYPT_ASYM_ALIGN_UP( buf1 ), sizeof( buf1 ) - SYMCRYPT_ASYM_ALIGN_VALUE, pmMod ); - PSYMCRYPT_MODELEMENT peTmp2 = SymCryptModElementCreate( SYMCRYPT_ASYM_ALIGN_UP( buf2 ), sizeof( buf2 ) - SYMCRYPT_ASYM_ALIGN_VALUE, pmMod ); - - (VOID) peTmp1; - (VOID) peTmp2; - - SymCryptFdefModMulMontgomery256Asm( pmMod, peSrc1, peSrc2, peTmp1, pbScratch, cbScratch ); - //SymCryptFdefModMulMontgomery( pmMod, peSrc1, peSrc2, peTmp2, pbScratch, cbScratch ); *** This doesn't produce the same result as it reduces a whole digit, not 256 bits - - if( memcmp( peTmp1, peTmp2, 64 ) != 0 ) - { - // SymCryptFatal( 42 ); - } - - SymCryptFdefModMulMontgomery256Asm( pmMod, peSrc1, peSrc2, peDst, pbScratch, cbScratch ); -} - VOID SYMCRYPT_CALL SymCryptFdefModSquareMontgomery256( diff --git a/lib/i386/fdef_asm.asm b/lib/i386/fdef_asm.cppasm similarity index 94% rename from lib/i386/fdef_asm.asm rename to lib/i386/fdef_asm.cppasm index 1597236..d9663ce 100644 --- a/lib/i386/fdef_asm.asm +++ b/lib/i386/fdef_asm.cppasm @@ -1,5 +1,6 @@ ; -; fdef_asm.asm Assembler code for fast arithmetic +; fdef_asm.cppasm Assembler code for fast arithmetic +; Requires C preprocessor to correctly include C_asm_shared.inc ; ; Copyright (c) Microsoft Corporation. Licensed under the MIT license. ; @@ -11,9 +12,9 @@ ; ; FPO documentation: ; The .FPO provides debugging information. -; This stuff not well documented, +; This stuff not well documented, ; but here is the information I've gathered about the arguments to .FPO -; +; ; In order: ; cdwLocals: Size of local variables, in DWords ; cdwParams: Size of parameters, in DWords. Given that this is all about @@ -23,7 +24,7 @@ ; prolog code with work for better performance. Most uses of ; .FPO seem to set this value to 0 anyway, which is what we ; will do. -; cbRegs : # registers saved in the prolog. +; cbRegs : # registers saved in the prolog. ; fUseBP : 0 if EBP is not used as base pointer, 1 if EBP is used as base pointer ; cbFrame : Type of frame. ; 0 = FPO frame (no frame pointer) @@ -43,7 +44,7 @@ _TEXT SEGMENT PARA PUBLIC USE32 'CODE' include symcrypt_version.inc include symcrypt_magic.inc -include C_asm_shared.inc +#include "C_asm_shared.inc" PUBLIC @SymCryptFdefRawAddAsm@16 PUBLIC @SymCryptFdefRawSubAsm@16 @@ -60,7 +61,7 @@ BEFORE_PROC MACRO ; DB 5 dup (0cch) ENDM - + @@ -86,7 +87,7 @@ pDst dd ? nDigits dd ? SymCryptFdefRawAddAsmFrame ends - + ; ecx = pSrc1 ; edx = pSrc2 @@ -129,7 +130,7 @@ SymCryptFdefRawAddAsmLoop: pop edi pop ebx ret 8 - + @SymCryptFdefRawAddAsm@16 ENDP @@ -154,7 +155,7 @@ pDst dd ? nDigits dd ? 
SymCryptFdefRawSubAsmFrame ends - + ; ecx = pSrc1 ; edx = pSrc2 @@ -197,7 +198,7 @@ SymCryptFdefRawSubAsmLoop: pop edi pop ebx ret 8 - + @SymCryptFdefRawSubAsm@16 ENDP @@ -305,8 +306,8 @@ SymCryptFdefRawMulAsmFrame ends ; for each word in Src1: ; Dst += Src2 * word ; Register assignments - ; - ; eax = tmp/lower half of mult + ; + ; eax = tmp/lower half of mult ; ebx = multiplicant ; ecx = loop counter, initialized to nDigits2 ; edx = upper half of mult @@ -315,7 +316,7 @@ SymCryptFdefRawMulAsmFrame ends ; ebp = carry ; ; esp + pSrc1 running pointer into Src1 - ; esp + + ; esp + mov edi,edi @@ -436,7 +437,7 @@ SymCryptFdefRawMulAsmLoop2: adc edx, 0 mov [edi + 12], eax mov ebp, edx - + add esi, 16 add edi, 16 sub ecx,1 @@ -477,7 +478,7 @@ SymCryptFdefMontgomeryReduceAsmFrame struct 4, NONUNIQUE HighCarry dd ? pSrc dd ? pModValue dd ? - nWords dd ? + nWords dd ? SaveEbp dd ? ; # words still to process in outer loop SaveEsi dd ? SaveEdi dd ? @@ -513,13 +514,13 @@ SymCryptFdefMontgomeryReduceAsmFrame ends SymCryptFdefMontgomeryReduceOuterLoop: ; eax = ; ebx = - ; ecx = + ; ecx = ; edx = ; esi = start of mod value ; edi = pSrc + 4 * loop iteration count ; ebp = - ; compute multiplier for this outer loop iteration. + ; compute multiplier for this outer loop iteration. mov ebx, [esi - SymCryptModulusValueOffsetX86 + SymCryptModulusMontgomeryInv64OffsetX86 ] imul ebx, [edi] ; word we want to zero out, ebx = multiplier for this inner loop @@ -529,7 +530,7 @@ SymCryptFdefMontgomeryReduceOuterLoop: SymCryptFdefMontgomeryReduceInnerLoop: ; eax = mul scratch ; ebx = multiplier - ; ecx = digit counter + ; ecx = digit counter ; edx = mul scratch ; esi = running pointer to mod value ; edi = running pointer to input/scratch @@ -570,7 +571,7 @@ SymCryptFdefMontgomeryReduceInnerLoop: adc edx, 0 mov [edi + 12], eax mov ebp, edx - + add esi, 16 add edi, 16 sub ecx,1 @@ -606,7 +607,7 @@ SymCryptFdefMontgomeryReduceInnerLoop: mov ecx, [esi - SymCryptModulusValueOffsetX86 + SymCryptModulusNdigitsOffsetX86] ; loop counter mov edx, [esp + SymCryptFdefMontgomeryReduceAsmFrame.pDst]; - + ; ecx = nDigits ; Save some values for the copy loop diff --git a/lib/i386/rc4asm.asm b/lib/i386/rc4asm.asm deleted file mode 100644 index de8f6c5..0000000 --- a/lib/i386/rc4asm.asm +++ /dev/null @@ -1,314 +0,0 @@ -; -; rc4asm.asm -; -; Copyright (c) Microsoft Corporation. Licensed under the MIT license. -; -; RC4 implementation in x86 assembler -; This is a new RC4 implementation for SymCrypt. -; It is NOT based on the existing one in RSA32.lib. -; - - - TITLE "RC4" - .586P - -_TEXT SEGMENT PARA PUBLIC USE32 'CODE' - ASSUME CS:_TEXT, DS:FLAT, SS:FLAT - -include symcrypt_version.inc -include symcrypt_magic.inc - -; -; Structure definition that mirrors the SYMCRYPT_RC4_STATE struct -; - -RC4_STATE struct - S db 256 dup (?) - i db ? - j db ? - - SYMCRYPT_MAGIC_FIELD - -RC4_STATE ends - - - PUBLIC @SymCryptRc4InitAsm@12 - PUBLIC @SymCryptRc4CryptAsm@16 - - -BEFORE_PROC MACRO - ; - ; Our current x86 compiler inserts 5 0xcc bytes before every function - ; and starts every function with a 2-byte NOP. - ; This supports hot-patching. - ; - DB 5 dup (0cch) - ENDM - - -; The .FPO provides debugging information. -; This stuff not well documented, -; but here is the information I've gathered about the arguments to .FPO -; -; In order: -; cdwLocals: Size of local variables, in DWords -; cdwParams: Size of parameters, in DWords. Given that this is all about -; stack stuff, I'm assuming this is only about parameters passed -; on the stack. 
-; cbProlog : Number of bytes in the prolog code. We have interleaved the -; prolog code with work for better performance. Most uses of -; .FPO seem to set this value to 0 anyway, which is what we -; will do. -; cbRegs : # registers saved in the prolog. 4 in our case -; fUseBP : 0 if EBP is not used as base pointer, 1 if EBP is used as base pointer -; cbFrame : Type of frame. -; 0 = FPO frame (no frame pointer) -; 1 = Trap frame (result of a CPU trap event) -; 2 = TSS frame -; -; Having looked at various occurrences of .FPO in the Windows code it -; seems to be used fairly sloppy, with lots of arguments left 0 even when -; they probably shouldn't be according to the spec. -; - - - - BEFORE_PROC - -@SymCryptRc4InitAsm@12 PROC -;VOID -;SYMCRYPT_CALL -;SymCryptRc4InitAsm( -; _Out_ PSYMCRYPT_RC4_STATE pState, -; _In_reads_bytes_( cbKey ) PCBYTE pbKey, -; _In_ SIZE_T cbKey ); -; -; NOTE: Unlike the SymCryptRc4Init function -; this function does not check the cbKey validity, and does not return an error code. -; Currently we don't have the error code values symbolically in the asm environment. -; We use an inlined function to generate the errors instead, and call this function -; only when there are no errors. -; - -Rc4InitFrame struct 4, NONUNIQUE - -pbKey dd ? -SaveEdi dd ? -SaveEsi dd ? -SaveEbp dd ? -SaveEbx dd ? -ReturnAddress dd ? -cbKey dd ? - -Rc4InitFrame ends - - .FPO(5,1,0,4,0,0) - - ; ecx = pState - ; edx = pKey - ; [esp + 4] = cbKey - - ; - ; Set up stack frame, and initialize pbKey - ; - mov edi,edi ; 2-byte NOP for hot-patching - - push ebx - push ebp - push esi - push edi - push edx - - ; - ; Initialize S[i] = i - ; - lea esi,[ecx + 100h] - mov edi,ecx - - mov eax,03020100h - mov ebx,04040404h - -@@: - mov [edi],eax - add eax,ebx - mov [edi+4],eax - add eax,ebx - mov [edi+8],eax - add eax,ebx - mov [edi+12],eax - add eax,ebx - add edi,16 - cmp edi,esi - jb @B - - - mov ebp,edx - xor ebx,ebx ; j = 0 - xor esi,esi ; i = 0 - mov edi,[esp + Rc4InitFrame.cbKey] - add edi, edx ; edi = pbKey + cbKey - -SymCryptRc4InitMainLoop: - ; Registers: - ; eax = Tmp1 - ; ebx = j - ; ecx = S - ; edx = Tmp2 - ; esi = i - ; edi = keyLimit ; just beyond the key - ; ebp = pKey ; pointer to current key byte - - movzx edx,byte ptr[ebp] ; get key byte - add ebx,edx ; j += key byte - movzx eax,byte ptr[ecx + esi] ; get S[i] - add ebx,eax ; j += S[i] - - and ebx,0ffh - - movzx edx,byte ptr [ecx + ebx]; get S[j] - mov byte ptr[ecx + ebx], al ; update S[j] - mov byte ptr[ecx + esi], dl ; update S[i] - - add ebp,1 ; increment key pointer modulo key length - cmp ebp,edi - jb @F - mov ebp,[esp + Rc4InitFrame.pbKey] -@@: - - add esi,1 ; increment i - cmp esi,100h - jb SymCryptRc4InitMainLoop - - mov word ptr [ecx + RC4_STATE.i], 1 ; i = 1; j = 0 - - add esp,4 - pop edi - pop esi - pop ebp - pop ebx - ret 4 - - -@SymCryptRc4InitAsm@12 ENDP - - - - - BEFORE_PROC - -@SymCryptRc4CryptAsm@16 PROC -;VOID -;SYMCRYPT_CALL -;SymCryptRc4Crypt( -; _Inout_ PSYMCRYPT_RC4_STATE pState, -; _In_reads_bytes_( cbData ) PCBYTE pbSrc, -; _Out_writes_bytes_( cbData ) PBYTE pbDst, -; _In_ SIZE_T cbData ) - -Rc4CryptFrame struct 4, NONUNIQUE -pbEndDst dd ? -SaveEdi dd ? -SaveEsi dd ? -SaveEbp dd ? -SaveEbx dd ? -ReturnAddress dd ? -pbDst dd ? -cbData dd ? 
- -Rc4CryptFrame ends - - .FPO(5,2,0,4,0,0) - - - mov edi,edi - - push ebx - push ebp - push esi - push edi - sub esp,4 - - SYMCRYPT_CHECK_MAGIC ecx, RC4_STATE - - mov eax,[esp + Rc4CryptFrame.cbData] - test eax,eax - jz Rc4CryptDoNothing - - mov ebp,[esp + Rc4CryptFrame.pbDst] - add eax,ebp - mov [esp + Rc4CryptFrame.pbEndDst], eax - - mov edi, edx - movzx edx,[ecx + RC4_STATE.i] - movzx esi,[ecx + RC4_STATE.j] - - ; - ; Further perf improvements are possible. - ; Instead of encrypting byte-by-byte, we can collect 4 bytes of the key - ; stream in a register, and then encrypt 4 bytes at a time. - ; This reduces the # memory operations we do per byte. - ; Ideally this is done with aligned operations, either - ; aligning to pbSrc, pbDst, or to i (which removes the need to increment i every time). - ; - -@@: - ; eax Ti - ; ebx Tj - ; ecx S - ; edx i - ; esi j - ; edi pSrc - ; ebp pDst - - movzx eax, byte ptr[ecx + edx] ; Ti = S[i] - - ;add esi, eax - ;and esi, 0ffh - lea ebx, [esi + eax] - movzx esi, bl ; j += Ti - - movzx ebx, byte ptr[ecx + esi] ; Tj = S[j] - mov [ecx + edx], bl ; S[i] = Tj - mov [ecx + esi], al ; S[j] = Ti - - ;add eax,ebx - ;and eax,0ffh - lea eax,[eax + ebx] - movzx eax,al ; Ti = Ti + Tj - - mov al,[ecx + eax] ; Til = S[Ti] - - ;add edx, 1 - ;and 0ffh - lea edx,[edx + 1] - movzx edx,dl ; i += 1 - - xor al,[edi] - add edi,1 - mov [ebp],al - add ebp, 1 - - cmp ebp,[esp + Rc4CryptFrame.pbEndDst] - jb @B - - mov eax, esi - mov [ecx + RC4_STATE.i], dl - mov [ecx + RC4_STATE.j], al - -Rc4CryptDoNothing: - - add esp,4 - pop edi - pop esi - pop ebp - pop ebx - ret 8 - - -@SymCryptRc4CryptAsm@16 ENDP - - - -_TEXT ENDS - - END diff --git a/lib/i386/sha1asm.asm b/lib/i386/sha1asm.asm deleted file mode 100644 index 5de211d..0000000 --- a/lib/i386/sha1asm.asm +++ /dev/null @@ -1,383 +0,0 @@ -; -; Sha1Asm.Asm -; -; Copyright (c) Microsoft Corporation. Licensed under the MIT license. -; -; - -; -; This module implements the bulk processing of the FIPS 180-1 SHA message digest algorithm. -; for the x86 processor architecture. -; -; This implementation is derived from an older one by Scott Field and -; Dan Shumow. -; -; This implementation is optimized for Intel Core and contemporary AMD CPUs. -; Optimizations for pre-P3 Intel CPUs has been removed. -; - - - TITLE sha1asm.asm - .486 - -_TEXT SEGMENT PARA PUBLIC USE32 'CODE' - ASSUME CS:_TEXT, DS:FLAT, SS:FLAT - - PUBLIC @SymCryptSha1AppendBlocksAsm@12 - - ; - ; The four round constants used by SHA-1 - ; - -K0_19 EQU 05a827999H -K20_39 EQU 06ed9eba1H -K40_59 EQU 08f1bbcdcH -K60_79 EQU 0ca62c1d6H - - align 16 - -;VOID -;SYMCRYPT_CALL -;SymCryptSha1AppendBlocks( _Inout_updates_( 5 ) PUINT32 H, -; _In_reads_bytes_( cbData ) PCBYTE pbData, -; SIZE_T cbData ) -; -@SymCryptSha1AppendBlocksAsm@12 PROC - -; -; To keep stack manipulatins simple we define a structure and use that for all accesses. -; -SymCryptSha1AppendBlocksFrame struct 4, NONUNIQUE - -Wbuf dd 16 dup (?) -Hptr dd ? -pbData dd ? -BlockCount dd ? -SaveEdi dd ? -SaveEsi dd ? -SaveEbp dd ? -SaveEbx dd ? -ReturnAddress dd ? -CbData dd ? - -SymCryptSha1AppendBlocksFrame ends - - ; - ; We use the W buffer extensively; this is a shorthand for the base address - ; -W equ esp+SymCryptSha1AppendBlocksFrame.Wbuf - - ; - ; The .FPO provides debugging information for stack frames that do not use - ; ebp as a base pointer. 
- ; This stuff not well documented, - ; but here is the information I've gathered about the arguments to .FPO - ; - ; In order: - ; cdwLocals: Size of local variables, in DWords - ; cdwParams: Size of parameters, in DWords. Given that this is all about - ; stack stuff, I'm assuming this is only about parameters passed - ; on the stack. - ; cbProlog : Number of bytes in the prolog code. We sometimes interleaved the - ; prolog code with work for better performance. Most uses of - ; .FPO seem to set this value to 0. - ; The debugger seems to work if the prolog defined by this value - ; contains all the stack adjustments. - ; cbRegs : # registers saved in the prolog. 4 in our case - ; fUseBP : 0 if EBP is not used as base pointer, 1 if EBP is used as base pointer - ; cbFrame : Type of frame. - ; 0 = FPO frame (no frame pointer) - ; 1 = Trap frame (result of a CPU trap event) - ; 2 = TSS frame - ; - ; Having looked at various occurrences of .FPO in the Windows code it - ; seems to be used fairly sloppy, with lots of arguments left 0 even when - ; they probably shouldn't be according to the spec. - ; - .FPO(23,1,3,4,0,0) ; 3 byte prolog (covers esp ajustment only) - - ; At this point: - ; ecx = H - ; edx = pbData - ; [esp+4] = cbData - - ; - ; Set up our stack frame and save non-volatile registers - ; - sub esp,SymCryptSha1AppendBlocksFrame.ReturnAddress - mov [esp+SymCryptSha1AppendBlocksFrame.SaveEbp],ebp - mov [esp+SymCryptSha1AppendBlocksFrame.SaveEdi],edi - mov [esp+SymCryptSha1AppendBlocksFrame.SaveEsi],esi - mov [esp+SymCryptSha1AppendBlocksFrame.SaveEbx],ebx - - mov [esp+SymCryptSha1AppendBlocksFrame.Hptr], ecx - - ; - ; To allow macro re-ordering of our registers we use symbolic names - ; for the registers. - ; r0-r4 are the 5 state registers. x1 and x2 are extra scratch registers. - ; Note: some prolog code puts the right value in the right register and - ; has to be updated if this mapping is changed. - ; -r0 EQU eax -r1 EQU ebx -r2 EQU ecx -r3 EQU edx -r4 EQU esi -x1 EQU ebp -x2 EQU edi - - ; - ; compute how many blocks we will process. - ; This code ensures that we never exceed the data buffer we were given, - ; although we silently round the cbData parameter down to the next - ; multiple of 64. - ; Do nothing if no blocks need to be processed. - ; - mov eax,[esp+SymCryptSha1AppendBlocksFrame.CbData] - shr eax,6 - jz SymCryptSha1AppendBlocksDone - mov [esp+SymCryptSha1AppendBlocksFrame.BlockCount], eax - - ; - ; The data pointer goes into x1 = ebp at the start of our loop - ; - mov ebp,edx - - ; - ; Load the H state from [ecx], making sure we load the r2=ecx register - ; last. - ; - mov r0,[ecx ] - mov r4,[ecx+ 4] - mov r3,[ecx+ 8] - mov r1,[ecx+16] - mov r2,[ecx+12] - - -SymCryptSha1AppendBlocksLoop: - ; - ; This is the main loop. We process 64 bytes in each iteration. - ; invariant: ebp = pbData - ; - - ; - ; Most of the code in the loop is generated through macros using parameters to - ; rename the registers. - ; The macros get the register number passed as parameter. They use - ; "r&" to paste the number and the 'r' together and get the register - ; name we defined above. - ; - -ROUND_CH_0_15 MACRO round,ra,rb,rc,rd,re,x1,x2 - ; - ; Code for round 0-15. - ; This code loads data from the data buffer & BSWAPs the data to get it into the - ; right form. 
- ; - ; Parameters: - ; round round number - ; ra register number that contains the a value - ; rb register number that contains the b value - ; rc register number that contains the c value - ; rd register number that contains the d value - ; re register number that contains the e value - ; x1 pointer to the input data - ; x2 scratch register. - ; - ; We use the formula CH(b,c,d) = ((d ^ c) & b) ^ c which uses only one temp register. - ; We start with the d value as that is the oldest value and available the first - ; - ; See FIPS 180-2 for our symbolic notation. - ; - mov x2,[x1+4*round] ; Fetch word from message - bswap x2 ; x2 = Wt - add r&re,x2 ; re = e + Wt - mov [W + 4*round],x2 ; Store in W buffer for future use - - mov x2,r&ra ; x2 = a - rol x2,5 ; x2 = ROL(a,5) - add r&re,x2 ; re = e + Wt + ROL(a,5) - - mov x2,r&rd ; x2 = d - xor x2,r&rc ; x2 = (d ^ c) - and x2,r&rb ; x2 = ((d ^ c) & b) - ror r&rb,2 ; rb = ROL( b, 30 ) - xor x2,r&rd ; x2 = ((d ^ c) & b) ^ d = CH(b,c,d) - lea r&re,[r&re+x2+K0_19] ; re = e + Wt + ROL(a,5) + Ch(b,c,d) + Kt - - ENDM - -ROUND_CH MACRO round, ra, rb, rc, rd, re, x1, x2 - ; - ; See ROUND_CH_0_15 for most parameters. - ; x1 and x2 are both scratch registers - ; - mov x2,[W+4*((round-16) MOD 16)] ; x2 = W[t-16] - mov x1,r&ra ; x1 = a - rol x1,5 ; x1 = ROL(a,5) - xor x2,[W+4*((round-14) MOD 16)] ; x2 = W[t-16] ^ W[t-14] - add r&re,x1 ; re = e + ROL(a,5) - mov x1,r&rd ; x1 = d - xor x2,[W+4*((round- 8) MOD 16)] ; x2 = W[t-16] ^ W[t-14] ^ W[t-8] - xor x1,r&rc ; x1 = (d ^ c) - and x1,r&rb ; x1 = ((d ^ c) & b) - xor x2,[W+4*((round- 3) MOD 16)] ; x2 = W[t-16] ^ W[t-14] ^ W[t-8] ^ W[t-3] - xor x1,r&rd ; x1 = ((d ^ c) & b) ^ d = CH(b,c,d) - rol x2,1 ; x2 = Wt - mov [W+4*((round-16) MOD 16)],x2 ; - add r&re,x2 ; re = e + ROL(a,5) + Wt - ror r&rb,2 ; rb = ROL( b, 30 ) - lea r&re,[r&re+x1+K0_19] ; re = e + Wt + ROL(a,5) + Ch(b,c,d) + Kt - ENDM - -ROUND_PARITY MACRO round, ra, rb, rc, rd, re, x1, x2, K, store - ; - ; See ROUND_CH for most parameters - ; K is the round constant to use. - ; store is 1 if the Wt value should be stored, 0 otherwise - ; (used to avoid stores in the last few rounds) - ; - ; The order of xorring the registers b, c, and d is driven by the data dependency graph. 
- ; We start with d (the oldest) and then do b to unblock the subsequent rotate - ; - mov x2,[W+4*((round-16) MOD 16)] ; x2 = W[t-16] - mov x1,r&ra ; x1 = a - rol x1,5 ; x1 = ROL(a,5) - xor x2,[W+4*((round-14) MOD 16)] ; x2 = W[t-16] ^ W[t-14] - add r&re,x1 ; re = e + ROL(a,5) - mov x1,r&rd ; x1 = d - xor x2,[W+4*((round- 8) MOD 16)] ; x2 = W[t-16] ^ W[t-14] ^ W[t-8] - xor x1,r&rb ; x1 = (d ^ b) - xor x1,r&rc ; x1 = (d ^ b ^ c) = Parity(b,c,d) - xor x2,[W+4*((round- 3) MOD 16)] ; x2 = W[t-16] ^ W[t-14] ^ W[t-8] ^ W[t-3] - rol x2,1 ; x2 = Wt - add r&re,x1 ; re = e + ROL(a,5) + Parity(b,c,d) - IF store - mov [W+4*((round-16) MOD 16)],x2 ; - ENDIF - ror r&rb,2 ; rb = ROL( b, 30 ) - lea r&re,[r&re+x2+K] ; re = e + ROL(a,5) + Parity(b,c,d) + Wt + Kt - - ENDM - -ROUND_MAJ MACRO round, ra, rb, rc, rd, re, x1, x2 - ; - ; See above for parameter explanation - ; - mov x2,[W+4*((round-16) MOD 16)] ; x2 = W[t-16] - mov x1,r&ra ; x1 = a - rol x1,5 ; x1 = ROL(a,5) - xor x2,[W+4*((round-14) MOD 16)] ; x2 = W[t-16] ^ W[t-14] - add r&re,x1 ; re = e + ROL(a,5) - mov x1,r&rd ; x1 = d - xor x2,[W+4*((round- 8) MOD 16)] ; x2 = W[t-16] ^ W[t-14] ^ W[t-8] - or x1,r&rc ; x1 = (d | c) - and x1,r&rb ; x1 = ((d | c) & b) - xor x2,[W+4*((round- 3) MOD 16)] ; x2 = W[t-16] ^ W[t-14] ^ W[t-8] ^ W[t-3] = Wt - rol x2,1 ; x2 = Wt - add r&re,x2 ; re = e + ROL(a,5) + Wt - mov [W+4*((round-16) MOD 16)],x2 ; - - mov x2,r&rc ; x2 = c - and x2,r&rd ; x2 = (c & d) - or x1,x2 ; x1 = ((d | c) & b) | (d & c) = MAJ(b,c,d) - - ror r&rb,2 ; rb = ROL( b, 30 ) - - lea r&re,[r&re+x1+K40_59] ; re = e + ROL(a,5) + Wt + Maj(b,c,d) + Kt - ENDM - - ; - ; With these macros we can now produce the actual code. - ; Note the use of the % operator which evaluates the expression and yields the result as text. - ; Together with the macros and the r EQUs this provides us with automatic register renaming - ; for each round. - ; - FOR t, <0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15> - ROUND_CH_0_15 t, %(t MOD 5), %((t + 4) MOD 5), %((t + 3) MOD 5), %((t + 2) MOD 5), %((t + 1) MOD 5), x1, x2 - ENDM - - ; - ; For the rest of the computation we need the extra register, so we update the data pointer and store it. - ; - add ebp,64 - mov [esp+SymCryptSha1AppendBlocksFrame.pbData], ebp - - FOR t, <16, 17, 18, 19> - ROUND_CH t, %(t MOD 5), %((t + 4) MOD 5), %((t + 3) MOD 5), %((t + 2) MOD 5), %((t + 1) MOD 5), x1, x2 - ENDM - - FOR t, <20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39> - ROUND_PARITY t, %(t MOD 5), %((t + 4) MOD 5), %((t + 3) MOD 5), %((t + 2) MOD 5), %((t + 1) MOD 5), x1, x2, K20_39, 1 - ENDM - - FOR t, <40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59> - ROUND_MAJ t, %(t MOD 5), %((t + 4) MOD 5), %((t + 3) MOD 5), %((t + 2) MOD 5), %((t + 1) MOD 5), x1, x2 - ENDM - - FOR t, <60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76> - ROUND_PARITY t, %(t MOD 5), %((t + 4) MOD 5), %((t + 3) MOD 5), %((t + 2) MOD 5), %((t + 1) MOD 5), x1, x2, K60_79, 1 - ENDM - - ; - ; The last three rounds do not need to store their Wt in the W buffer as that value will never get used. 
- ; - FOR t, <77, 78, 79> - ROUND_PARITY t, %(t MOD 5), %((t + 4) MOD 5), %((t + 3) MOD 5), %((t + 2) MOD 5), %((t + 1) MOD 5), x1, x2, K60_79, 0 - ENDM - - ; - ; Now we update the state - ; - mov x2,[esp+SymCryptSha1AppendBlocksFrame.Hptr] - add r0,[x2 ] - add r4,[x2+ 4] - add r3,[x2+ 8] - add r2,[x2+12] - add r1,[x2+16] - - mov [x2 ], r0 - mov [x2+ 4], r4 - mov [x2+ 8], r3 - mov [x2+12], r2 - mov [x2+16], r1 - - ; - ; See if we have more data to process, and load the data pointer register again - ; - dec [esp+SymCryptSha1AppendBlocksFrame.BlockCount] - mov ebp, [esp+SymCryptSha1AppendBlocksFrame.pbData] - jnz SymCryptSha1AppendBlocksLoop - - ; - ; We're done processing the blocks. The result is already in the state, so all we have to do - ; is clean up. - ; - ; Wipe the W buffer - ; The @@: label is an anonymous label. You can refer to the previous one using @B, which is easy to read. - ; - mov ecx,8 - xor eax,eax -@@: dec ecx - mov [esp+8*ecx],eax - mov [esp+8*ecx+4],eax - jnz @B - -SymCryptSha1AppendBlocksDone: - ; - ; Restore non-volatile regisers & stackpointer - ; - mov ebp,[esp+SymCryptSha1AppendBlocksFrame.SaveEbp] - mov edi,[esp+SymCryptSha1AppendBlocksFrame.SaveEdi] - mov esi,[esp+SymCryptSha1AppendBlocksFrame.SaveEsi] - mov ebx,[esp+SymCryptSha1AppendBlocksFrame.SaveEbx] - add esp,SymCryptSha1AppendBlocksFrame.ReturnAddress - - ret 4 - -@SymCryptSha1AppendBlocksAsm@12 ENDP -_TEXT ENDS - -END - diff --git a/lib/libmain.c b/lib/libmain.c index c8e050d..809b664 100644 --- a/lib/libmain.c +++ b/lib/libmain.c @@ -7,9 +7,7 @@ #include "precomp.h" -#define EQU = #include "C_asm_shared.inc" -#undef EQU #include "buildInfo.h" @@ -34,16 +32,16 @@ SymCryptLibraryWasNotInitialized() #endif -const CHAR * SymCryptBuildString = - "v" SYMCRYPT_BUILD_INFO_VERSION - "_" SYMCRYPT_BUILD_INFO_BRANCH +const CHAR * SymCryptBuildString = + "v" SYMCRYPT_BUILD_INFO_VERSION + "_" SYMCRYPT_BUILD_INFO_BRANCH "_" SYMCRYPT_BUILD_INFO_COMMIT "_" SYMCRYPT_BUILD_INFO_TIMESTAMP; VOID SYMCRYPT_CALL SymCryptInitEnvCommon( UINT32 version ) -// Returns TRUE if the initializatoin steps have to be performed. +// Returns TRUE if the initialization steps have to be performed. { UINT32 tmp; diff --git a/lib/linux/asmstubs.c b/lib/linux/asmstubs.c deleted file mode 100644 index d1fdbc3..0000000 --- a/lib/linux/asmstubs.c +++ /dev/null @@ -1,223 +0,0 @@ -// -// asmstubs.c -// Temporary forwarders for ASM implementations which we don't yet support with GCC/LLVM -// -// Copyright (c) Microsoft Corporation. Licensed under the MIT license. 
-//
-
-#include "../precomp.h"
-
-extern const SYMCRYPT_BLOCKCIPHER SymCryptAesBlockCipherNoOpt;
-
-VOID
-SYMCRYPT_CALL
-SymCryptAesEncryptAsm(
-    _In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
-    _In_reads_(SYMCRYPT_AES_BLOCK_SIZE) PCBYTE pbSrc,
-    _Out_writes_(SYMCRYPT_AES_BLOCK_SIZE) PBYTE pbDst )
-{
-    SymCryptAesEncryptC( pExpandedKey, pbSrc, pbDst );
-}
-
-VOID
-SYMCRYPT_CALL
-SymCryptAesDecryptAsm(
-    _In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
-    _In_reads_(SYMCRYPT_AES_BLOCK_SIZE) PCBYTE pbSrc,
-    _Out_writes_(SYMCRYPT_AES_BLOCK_SIZE) PBYTE pbDst )
-{
-    SymCryptAesDecryptC( pExpandedKey, pbSrc, pbDst );
-}
-
-VOID
-SYMCRYPT_CALL
-SymCryptAesCbcEncryptAsm(
-    _In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
-    _Inout_updates_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbChainingValue,
-    _In_reads_( cbData ) PCBYTE pbSrc,
-    _Out_writes_( cbData ) PBYTE pbDst,
-    SIZE_T cbData )
-{
-    SymCryptCbcEncrypt( &SymCryptAesBlockCipherNoOpt, pExpandedKey, pbChainingValue, pbSrc, pbDst, cbData );
-}
-
-VOID
-SYMCRYPT_CALL
-SymCryptAesCbcDecryptAsm(
-    _In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
-    _Inout_updates_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbChainingValue,
-    _In_reads_( cbData ) PCBYTE pbSrc,
-    _Out_writes_( cbData ) PBYTE pbDst,
-    SIZE_T cbData )
-{
-    SymCryptCbcDecrypt( &SymCryptAesBlockCipherNoOpt, pExpandedKey, pbChainingValue, pbSrc, pbDst, cbData );
-}
-
-VOID
-SYMCRYPT_CALL
-SymCryptAesCtrMsb64Asm(
-    _In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
-    _Inout_updates_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbChainingValue,
-    _In_reads_( cbData ) PCBYTE pbSrc,
-    _Out_writes_( cbData ) PBYTE pbDst,
-    SIZE_T cbData )
-{
-    SYMCRYPT_ASSERT( SymCryptAesBlockCipherNoOpt.blockSize == SYMCRYPT_AES_BLOCK_SIZE ); // keep Prefast happy
-    SymCryptCtrMsb64( &SymCryptAesBlockCipherNoOpt, pExpandedKey, pbChainingValue, pbSrc, pbDst, cbData );
-}
-
-VOID
-SYMCRYPT_CALL
-SymCryptWipeAsm( _Out_writes_bytes_( cbData ) PVOID pbData, SIZE_T cbData )
-{
-    volatile BYTE * p = (volatile BYTE *) pbData;
-    SIZE_T i;
-
-    for( i=0; i<cbData; i++ )
-    {
-        p[i] = 0;
-    }
-}
diff --git a/scripts/symcryptasm_processor.py b/scripts/symcryptasm_processor.py
new file mode 100644
--- /dev/null
+++ b/scripts/symcryptasm_processor.py
+"""
+This script processes symcryptasm files into assembly for a target assembler (currently MASM or
+GAS); we use names of the form R<n> to denote registers in the symcryptasm register naming scheme.
+
+
+A leaf function (a function which does not call another function) begins with an invocation of the
+FUNCTION_START macro which currently takes 3 arguments:
+1) The function name
+    This must be the name that matches the corresponding declaration of the function
+2) The number of arguments (arg_count) that the function takes
+    These arguments will be accessible in some contiguous region of the symcrypt registers at the
+    start of the function
+    On amd64 this contiguous region is R1..R<arg_count>
+    Note: arg_count need not correspond to the exact number of arguments in the function declaration
+    if the assembly does not use some tail of the arguments
+3) The number of registers (reg_count) that the function uses
+    These registers will be accessible as R0..R<reg_count-1>
+
+A leaf function ends with the FUNCTION_END macro, which also takes the function name
+    (a FUNCTION_END macro's function name must match the preceding FUNCTION_START's name)
+
+At the function start a prologue is generated which arranges the arguments appropriately in
+registers, and saves non-volatile registers that have been requested to be used.
+At the function end an epilogue is generated which restores the non-volatile registers and returns.
+
+
+A nested function (a function which does call another function) is specified similarly, only using
+NESTED_FUNCTION_START and NESTED_FUNCTION_END macros.
+A nested function currently updates and aligns the stack pointer in the function prologue, and
+avoids use of the redzone in the SystemV ABI.
+
+
+A macro begins with an invocation of the MACRO_START macro which takes the macro name and a
+variable number of macro argument names. It ends with MACRO_END.
+
+### Architecture specifics ###
+
+### amd64 ###
+We allow up to 15 registers to be addressed, with the names:
+Q0-Q14 (64-bit registers), D0-D14 (32-bit registers), W0-W14 (16-bit registers), and B0-B14 (8-bit
+registers)
+Xmm0-Xmm5 registers may be used directly in assembly too, as in both amd64 calling conventions we
+currently support these registers are volatile, so they do not need any special handling
+
+On function entry we insert a prologue which ensures:
+Q0 is the result register (the return value of the function, and the low half of a multiplication)
+Q1-Q6 are the first 6 arguments passed to the function
+
+Additionally, there is a special case for functions using mul or mulx instructions, as these
+instructions make rdx a special register. Functions using these instructions may address Q0-Q14,
+and QH. As rdx is used to pass arguments, its value is moved to another register in the function
+prologue. The MUL_FUNCTION_START and MUL_FUNCTION_END macros are used in this case.
+    We currently do not support nested mul functions, as we have none of them.
+
+"""
+
+import re
+import types
+import logging
+
+class Register:
+    """A class to represent registers"""
+
+    def __init__(self, name64, name32, name16, name8):
+        self.name64 = name64
+        self.name32 = name32
+        self.name16 = name16
+        self.name8 = name8
+
+# amd64 registers
+REG_RAX = Register("rax", "eax", "ax", "al")
+REG_RBX = Register("rbx", "ebx", "bx", "bl")
+REG_RCX = Register("rcx", "ecx", "cx", "cl")
+REG_RDX = Register("rdx", "edx", "dx", "dl")
+REG_RSI = Register("rsi", "esi", "si", "sil")
+REG_RDI = Register("rdi", "edi", "di", "dil")
+REG_RSP = Register("rsp", "esp", "sp", "spl")
+REG_RBP = Register("rbp", "ebp", "bp", "bpl")
+REG_R8 = Register( "r8", "r8d", "r8w", "r8b")
+REG_R9 = Register( "r9", "r9d", "r9w", "r9b")
+REG_R10 = Register("r10", "r10d", "r10w", "r10b")
+REG_R11 = Register("r11", "r11d", "r11w", "r11b")
+REG_R12 = Register("r12", "r12d", "r12w", "r12b")
+REG_R13 = Register("r13", "r13d", "r13w", "r13b")
+REG_R14 = Register("r14", "r14d", "r14w", "r14b")
+REG_R15 = Register("r15", "r15d", "r15w", "r15b")
+
+class CallingConvention:
+    """A class to represent calling conventions"""
+
+    def __init__(self, name, architecture, mapping, argument_registers, volatile_registers, gen_prologue_fn, gen_epilogue_fn, gen_get_memslot_offset_fn):
+        self.name = name
+        self.architecture = architecture
+        self.mapping = mapping
+        self.argument_registers = argument_registers
+        self.volatile_registers = volatile_registers
+        self.gen_prologue_fn = types.MethodType(gen_prologue_fn, self)
+        self.gen_epilogue_fn = types.MethodType(gen_epilogue_fn, self)
+        self.gen_get_memslot_offset_fn = types.MethodType(gen_get_memslot_offset_fn, self)
+
+
+def get_mul_mapping_from_normal_mapping(mapping, argument_registers):
+    """Gets the register mapping used in functions requiring special rdx handling.
+
+    In amd64, when using mul and mulx, rdx is a special register.
+    rdx is also used for passing arguments in both Msft and System V calling conventions.
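+    For instance, in the Msft x64 mapping defined below rdx holds argument 2, while in the
+    System V mapping it holds argument 3.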
+ In asm functions that use mul or mulx, we will explicitly move the argument passed in + rdx to a different volatile register in the function prologue, and in the function body + we refer to rdx using (Q|D|W|B)H. + """ + rdx_index = None + return_mapping = { 'H': REG_RDX } + for (index, register) in mapping.items(): + if register == REG_RDX: + rdx_index = index + break + for (index, register) in mapping.items(): + # preserve argument registers + if (index <= argument_registers) and (index != rdx_index): + return_mapping[index] = register + # replace rdx with the first non-argument register + if index == argument_registers+1: + return_mapping[rdx_index] = register + # shuffle all later registers down to fill the gap + if index > argument_registers+1: + return_mapping[index-1] = register + return return_mapping + +# Calling convention constants + +MAX_FUNCTION_ARGUMENT_COUNT = 6 # restrict to 6 arguments for now +MAX_FUNCTION_REGISTER_COUNT = 15 + +# Microsoft x64 calling convention +MAPPING_AMD64_MSFT = { + 0: REG_RAX, # Result register + 1: REG_RCX, # Argument 1 / volatile + 2: REG_RDX, # Argument 2 / volatile + 3: REG_R8, # Argument 3 / volatile + 4: REG_R9, # Argument 4 / volatile + 5: REG_R10, # volatile + 6: REG_R11, # volatile + 7: REG_RSI, # All registers from rsi are non-volatile and need to be saved/restored in epi/prologue + 8: REG_RDI, + 9: REG_RBP, + 10:REG_RBX, + 11:REG_R12, + 12:REG_R13, + 13:REG_R14, + 14:REG_R15, + # currently not mapping rsp +} + +def calc_amd64_shadow_space_allocation_size(self, reg_count): + # If we are a nested function, we must allocate 32B of shadow space on the stack, and ensure the + # stack pointer is aligned to 16B + # Before the prologue we have rsp % 16 == 8 - as the call pushed an 8B return address on an + # aligned stack + alignment = 8 + # We then pushed some number of additional 8B registers onto the stack + if reg_count > self.volatile_registers: + alignment = (alignment + (8 * (self.volatile_registers - reg_count))) % 16 + shadow_space_allocation_size = 32 + if alignment == 8: + # possibly allocate 8 more bytes to align the stack to 16B + shadow_space_allocation_size += 8 + return shadow_space_allocation_size + +def gen_prologue_amd64_msft(self, arg_count, reg_count, mul_fixup="", nested=False): + prologue = "\n" + if reg_count > self.volatile_registers: + prologue += "rex_push_reg Q%s\n" % self.volatile_registers + for i in range(self.volatile_registers+1, reg_count): + prologue += "push_reg Q%s\n" % i + prologue += "\nEND_PROLOGUE\n\n" + + shadow_space_allocation_size = 0 + + if nested: + shadow_space_allocation_size = calc_amd64_shadow_space_allocation_size(self, reg_count) + prologue += "sub rsp, %d // allocate shadow space and align stack\n\n" % shadow_space_allocation_size + + prologue += mul_fixup + + # put additional arguments into Q5-Q6 (we do not support more than 6 (MAX_FUNCTION_ARGUMENT_COUNT) arguments for now) + # stack_offset to get the 5th argument is: + # 32B of shadow space + 8B for return address + (8*#pushed registers in prologue) + shadow_space_allocation_size + stack_offset = 32 + 8 + (8*(reg_count-self.volatile_registers)) + shadow_space_allocation_size + for i in range(self.argument_registers+1, min(arg_count+1, MAX_FUNCTION_ARGUMENT_COUNT+1)): + prologue += "mov Q%s, [rsp + %d]\n" % (i, stack_offset) + stack_offset += 8 + return prologue + +def gen_prologue_amd64_msft_mul(self, arg_count, reg_count): + return gen_prologue_amd64_msft(self, arg_count, reg_count, "mov Q2, QH\n") + +def 
gen_prologue_amd64_msft_nested(self, arg_count, reg_count): + return gen_prologue_amd64_msft(self, arg_count, reg_count, "", nested=True) + +def gen_epilogue_amd64_msft(self, arg_count, reg_count, nested=False): + epilogue = "" + + if nested: + shadow_space_allocation_size = calc_amd64_shadow_space_allocation_size(self, reg_count) + epilogue += "add rsp, %d // deallocate shadow space and align stack\n\n" % shadow_space_allocation_size + + if reg_count > self.volatile_registers: + epilogue += "BEGIN_EPILOGUE\n" + for i in reversed(range(self.volatile_registers, reg_count)): + epilogue += "pop Q%s\n" % i + epilogue += "ret\n" + return epilogue + +def gen_epilogue_amd64_msft_nested(self, arg_count, reg_count): + return gen_epilogue_amd64_msft(self, arg_count, reg_count, nested=True) + +def gen_get_memslot_offset_amd64_msft(self, slot, arg_count, reg_count, nested=False): + # only support 4 memory slots for now (in shadow space) + if(slot >= 4): + logging.error("Symcryptasm currently only support 4 memory slots! (requested slot%d)" % slot) + exit(1) + # 8B for return address + (8*#pushed registers in prologue) + stack_offset = 8 + (8*(reg_count-self.volatile_registers)) + if nested: + stack_offset += calc_amd64_shadow_space_allocation_size(self, reg_count) + return "%d /*MEMSLOT%d*/" % (stack_offset+(8*slot), slot) + +def gen_get_memslot_offset_amd64_msft_nested(self, slot, arg_count, reg_count): + return gen_get_memslot_offset_amd64_msft(self, slot, arg_count, reg_count, nested=True) + +CALLING_CONVENTION_AMD64_MSFT = CallingConvention( + "msft_x64", "amd64", MAPPING_AMD64_MSFT, 4, 7, + gen_prologue_amd64_msft, gen_epilogue_amd64_msft, gen_get_memslot_offset_amd64_msft) +CALLING_CONVENTION_AMD64_MSFT_MUL = CallingConvention( + "msft_x64", "amd64", get_mul_mapping_from_normal_mapping(MAPPING_AMD64_MSFT, 4), 4, 6, + gen_prologue_amd64_msft_mul, gen_epilogue_amd64_msft, gen_get_memslot_offset_amd64_msft) +CALLING_CONVENTION_AMD64_MSFT_NESTED = CallingConvention( + "msft_x64", "amd64", MAPPING_AMD64_MSFT, 4, 7, + gen_prologue_amd64_msft_nested, gen_epilogue_amd64_msft_nested, gen_get_memslot_offset_amd64_msft_nested) + +# AMD64 System V calling convention +MAPPING_AMD64_SYSTEMV = { + 0: REG_RAX, # Result register + 1: REG_RDI, # Argument 1 / volatile + 2: REG_RSI, # Argument 2 / volatile + 3: REG_RDX, # Argument 3 / volatile + 4: REG_RCX, # Argument 4 / volatile + 5: REG_R8, # Argument 5 / volatile + 6: REG_R9, # Argument 6 / volatile + 7: REG_R10, # volatile + 8: REG_R11, # volatile + 9: REG_RBX, # All registers from rbx are non-volatile and need to be saved/restored in epi/prologue + 10:REG_RBP, + 11:REG_R12, + 12:REG_R13, + 13:REG_R14, + 14:REG_R15 + # currently not mapping rsp +} + +def gen_prologue_amd64_systemv(self, arg_count, reg_count, mul_fixup="", nested=False): + # push volatile registers onto the stack + prologue = "\n" + if reg_count > self.volatile_registers: + for i in range(self.volatile_registers, reg_count): + prologue += "push Q%s\n" % i + + # If we are a nested function, we need to align the stack to 16B, and allocate space for up to 4 + # memory slots not in the redzone. 
We can use the same logic as on the MSFT x64 side to allocate + # our own space for 32B of local variables (whereas on the MSFT side, we use this for allocating + # space for a function we are about to call) + if nested: + allocation_size = calc_amd64_shadow_space_allocation_size(self, reg_count) + prologue += "sub rsp, %d // allocate memslot space and align stack\n\n" % allocation_size + + prologue += mul_fixup + + # do not support more than 6 (MAX_FUNCTION_ARGUMENT_COUNT) arguments for now + # # put additional arguments into Q7-Qn + # # stack_offset to get the 7th argument is: + # # 8B for return address + # stack_offset = 8 + # for i in range(self.argument_registers+1, arg_count+1): + # prologue += "mov Q%s, [rsp + %d]\n" % (i, stack_offset) + # stack_offset += 8 + + return prologue + +def gen_prologue_amd64_systemv_mul(self, arg_count, reg_count): + return gen_prologue_amd64_systemv(self, arg_count, reg_count, "mov Q3, QH\n") + +def gen_prologue_amd64_systemv_nested(self, arg_count, reg_count): + return gen_prologue_amd64_systemv(self, arg_count, reg_count, "", nested=True) + +def gen_epilogue_amd64_systemv(self, arg_count, reg_count, nested=False): + epilogue = "" + + if nested: + allocation_size = calc_amd64_shadow_space_allocation_size(self, reg_count) + epilogue += "add rsp, %d // deallocate memslot space and align stack\n\n" % allocation_size + + if reg_count > self.volatile_registers: + for i in reversed(range(self.volatile_registers, reg_count)): + epilogue += "pop Q%s\n" % i + epilogue += "ret\n" + return epilogue + +def gen_epilogue_amd64_systemv_nested(self, arg_count, reg_count): + return gen_epilogue_amd64_systemv(self, arg_count, reg_count, nested=True) + +def gen_get_memslot_offset_amd64_systemv(self, slot, arg_count, reg_count, nested=False): + # only support 4 memory slots for now + if(slot >= 4): + logging.error("Symcryptasm currently only support 4 memory slots! 
(requested slot%d)" % slot) + exit(1) + # For leaf functions, use the top of the redzone below the stack pointer + offset = -8 * (slot+1) + if nested: + # For nested functions, use the 32B of memslot space above the stack pointer created in the prologue + offset = 8*slot + return "%d /*MEMSLOT%d*/" % (offset, slot) + +def gen_get_memslot_offset_amd64_systemv_nested(self, slot, arg_count, reg_count): + return gen_get_memslot_offset_amd64_systemv(self, slot, arg_count, reg_count, nested=True) + +CALLING_CONVENTION_AMD64_SYSTEMV = CallingConvention( + "amd64_systemv", "amd64", MAPPING_AMD64_SYSTEMV, 6, 9, + gen_prologue_amd64_systemv, gen_epilogue_amd64_systemv, gen_get_memslot_offset_amd64_systemv) +CALLING_CONVENTION_AMD64_SYSTEMV_MUL = CallingConvention( + "amd64_systemv", "amd64", get_mul_mapping_from_normal_mapping(MAPPING_AMD64_SYSTEMV, 6), 6, 8, + gen_prologue_amd64_systemv_mul, gen_epilogue_amd64_systemv, gen_get_memslot_offset_amd64_systemv) +CALLING_CONVENTION_AMD64_SYSTEMV_NESTED = CallingConvention( + "amd64_systemv", "amd64", MAPPING_AMD64_SYSTEMV, 6, 9, + gen_prologue_amd64_systemv_nested, gen_epilogue_amd64_systemv_nested, gen_get_memslot_offset_amd64_systemv_nested) + + +def gen_function_start_defines(mapping, arg_count, reg_count): + defines = "" + for (index, reg) in mapping.items(): + if (index != 'H') and (index >= max(arg_count+1, reg_count)): + continue + defines += "#define Q%s %s\n" % (index, reg.name64) + defines += "#define D%s %s\n" % (index, reg.name32) + defines += "#define W%s %s\n" % (index, reg.name16) + defines += "#define B%s %s\n" % (index, reg.name8) + return defines + +def gen_function_end_defines(mapping, arg_count, reg_count): + undefs = "" + for (index, _) in mapping.items(): + if (index != 'H') and (index >= max(arg_count+1, reg_count)): + continue + undefs += "#undef Q%s\n" % (index) + undefs += "#undef D%s\n" % (index) + undefs += "#undef W%s\n" % (index) + undefs += "#undef B%s\n" % (index) + return undefs + +MASM_FRAMELESS_FUNCTION_ENTRY = "LEAF_ENTRY %s, _TEXT\n" +MASM_FRAMELESS_FUNCTION_END = "LEAF_END %s, _TEXT\n" +MASM_FRAME_FUNCTION_ENTRY = "NESTED_ENTRY %s, _TEXT\n" +MASM_FRAME_FUNCTION_END = "NESTED_END %s, _TEXT\n" + +GAS_FUNCTION_ENTRY = "%s: .global %s\n" +GAS_FUNCTION_END = "" + +def generate_prologue(assembler, calling_convention, function_name, arg_count, reg_count, nested): + function_entry = None + if assembler == "masm": + # need to identify and mark up frame functions in masm + if nested or (reg_count > calling_convention.volatile_registers): + function_entry = MASM_FRAME_FUNCTION_ENTRY % (function_name) + else: + function_entry = MASM_FRAMELESS_FUNCTION_ENTRY % (function_name) + elif assembler == "gas": + function_entry = GAS_FUNCTION_ENTRY % (function_name, function_name) + + prologue = gen_function_start_defines(calling_convention.mapping, arg_count, reg_count) + prologue += "%s" % (function_entry) + prologue += calling_convention.gen_prologue_fn(arg_count, reg_count) + + return prologue + +def generate_epilogue(assembler, calling_convention, function_name, arg_count, reg_count, nested): + function_end = None + if assembler == "masm": + # need to identify and mark up frame functions in masm + if nested or (reg_count > calling_convention.volatile_registers): + function_end = MASM_FRAME_FUNCTION_END % (function_name) + else: + function_end = MASM_FRAMELESS_FUNCTION_END % (function_name) + elif assembler == "gas": + function_end = GAS_FUNCTION_END + + epilogue = calling_convention.gen_epilogue_fn(arg_count, reg_count) + epilogue += 
"%s" % (function_end) + epilogue += gen_function_end_defines(calling_convention.mapping, arg_count, reg_count) + + return epilogue + +MASM_MACRO_START = "%s MACRO %s\n" +MASM_MACRO_END = "ENDM\n" +GAS_MACRO_START = ".macro %s %s\n" +GAS_MACRO_END = ".endm\n" +MASM_ALTERNATE_ENTRY= "ALTERNATE_ENTRY %s\n" +GAS_ALTERNATE_ENTRY = "%s: .global %s\n" + + +FUNCTION_START_PATTERN = re.compile("\s*(NESTED_)?(MUL_)?FUNCTION_START\s*\(\s*([a-zA-Z0-9]+)\s*,\s*([0-9]+)\s*,\s*([0-9]+)\s*\)") +FUNCTION_END_PATTERN = re.compile("\s*(NESTED_)?(MUL_)?FUNCTION_END\s*\(\s*([a-zA-Z0-9]+)\s*\)") +GET_MEMSLOT_PATTERN = re.compile("GET_MEMSLOT_OFFSET\s*\(\s*slot([0-9]+)\s*\)") +ALTERNATE_ENTRY_PATTERN = re.compile("\s*ALTERNATE_ENTRY\s*\(\s*([a-zA-Z0-9]+)\s*\)") +MACRO_START_PATTERN = re.compile("\s*MACRO_START\s*\(\s*([A-Z_0-9]+)\s*,([^\)]+)\)") +MACRO_END_PATTERN = re.compile("\s*MACRO_END\s*\(\s*\)") + +class ProcessingStateMachine: + """A class to hold the state when processing a file and handle files line by line""" + + def __init__(self, assembler, normal_calling_convention, mul_calling_convention, nested_calling_convention): + self.assembler = assembler + self.normal_calling_convention = normal_calling_convention + self.mul_calling_convention = mul_calling_convention + self.nested_calling_convention = nested_calling_convention + + self.function_start_match = None + self.function_start_line = 0 + self.is_nested_function = None + self.is_mul_function = None + self.calling_convention = None + self.function_name = None + self.arg_count = None + self.reg_count = None + + self.macro_start_match = None + self.macro_name = None + self.macro_args = None + + def process_line(self, line, line_num): + if self.function_start_match == None and self.macro_start_match == None: + return self.process_normal_line(line, line_num) + elif self.function_start_match != None: + return self.process_function_line(line, line_num) + elif self.macro_start_match != None: + return self.process_macro_line(line, line_num) + else: + logging.error("Whoops, something is broken with the state machine (failed at line %d)" % line_num) + exit(1) + + def process_normal_line(self, line, line_num): + # Not currently in a function or macro + match = FUNCTION_START_PATTERN.match(line) + if (match): + return self.process_start_function(match, line, line_num) + + match = MACRO_START_PATTERN.match(line) + if (match): + return self.process_start_macro(match, line, line_num) + + # Not starting a function or a macro + return line + + def process_start_function(self, match, line, line_num): + # Entering a new function + self.function_start_match = match + self.function_start_line = line_num + self.is_nested_function = (match.group(1) == "NESTED_") + self.is_mul_function = (match.group(2) == "MUL_") + self.function_name = match.groups()[-3] + self.arg_count = int(match.groups()[-2]) + self.reg_count = int(match.groups()[-1]) + + if self.is_nested_function and self.is_mul_function: + logging.error( + "Too many prefixes for symcryptasm function - currently only 1 of prefix, MUL_ or NESTED_, is supported!\n\t" + "%s (line %d)" + % (line, line_num)) + exit(1) + if self.arg_count > MAX_FUNCTION_ARGUMENT_COUNT: + logging.error( + "Too many (%d) arguments for symcryptasm function - currently only %d arguments are supported!\n\t" + "%s (line %d)" + % (self.arg_count, MAX_FUNCTION_ARGUMENT_COUNT, match.group(0), line_num)) + exit(1) + if self.reg_count > MAX_FUNCTION_REGISTER_COUNT: + logging.error( + "Too many (%d) registers required for symcryptasm function - only 
+
+    def process_start_function(self, match, line, line_num):
+        # Entering a new function
+        self.function_start_match = match
+        self.function_start_line = line_num
+        self.is_nested_function = (match.group(1) == "NESTED_")
+        self.is_mul_function = (match.group(2) == "MUL_")
+        self.function_name = match.groups()[-3]
+        self.arg_count = int(match.groups()[-2])
+        self.reg_count = int(match.groups()[-1])
+
+        if self.is_nested_function and self.is_mul_function:
+            logging.error(
+                "Too many prefixes for symcryptasm function - currently at most one prefix (MUL_ or NESTED_) is supported!\n\t"
+                "%s (line %d)"
+                % (line, line_num))
+            exit(1)
+        if self.arg_count > MAX_FUNCTION_ARGUMENT_COUNT:
+            logging.error(
+                "Too many (%d) arguments for symcryptasm function - currently only %d arguments are supported!\n\t"
+                "%s (line %d)"
+                % (self.arg_count, MAX_FUNCTION_ARGUMENT_COUNT, match.group(0), line_num))
+            exit(1)
+        if self.reg_count > MAX_FUNCTION_REGISTER_COUNT:
+            logging.error(
+                "Too many (%d) registers required for symcryptasm function - only %d registers are supported!\n\t"
+                "%s (line %d)"
+                % (self.reg_count, MAX_FUNCTION_REGISTER_COUNT, match.group(0), line_num))
+            exit(1)
+        if self.is_mul_function and self.reg_count > MAX_FUNCTION_REGISTER_COUNT-1:
+            logging.error(
+                "Too many (%d) registers required for symcryptasm mul function - only %d registers are supported!\n\t"
+                "%s (line %d)"
+                % (self.reg_count, MAX_FUNCTION_REGISTER_COUNT-1, match.group(0), line_num))
+            exit(1)
+
+        logging.info("%d: function start %s, %d, %d" % (line_num, self.function_name, self.arg_count, self.reg_count))
+
+        if self.is_nested_function:
+            self.calling_convention = self.nested_calling_convention
+        elif self.is_mul_function:
+            self.calling_convention = self.mul_calling_convention
+        else:
+            self.calling_convention = self.normal_calling_convention
+
+        return generate_prologue(self.assembler, self.calling_convention, self.function_name, self.arg_count, self.reg_count, self.is_nested_function)
+
+    def process_start_macro(self, match, line, line_num):
+        self.macro_start_match = match
+        self.macro_name = match.group(1)
+        self.macro_args = [ x.strip() for x in match.group(2).split(",") ]
+
+        logging.info("%d: macro start %s, %s" % (line_num, self.macro_name, self.macro_args))
+
+        if self.assembler == "masm":
+            return MASM_MACRO_START % (self.macro_name, match.group(2))
+        elif self.assembler == "gas":
+            return GAS_MACRO_START % (self.macro_name, match.group(2))
+
+    def process_function_line(self, line, line_num):
+        # Currently in a function
+
+        match = ALTERNATE_ENTRY_PATTERN.match(line)
+        if (match):
+            if self.assembler == "masm":
+                return MASM_ALTERNATE_ENTRY % match.group(1)
+            elif self.assembler == "gas":
+                return GAS_ALTERNATE_ENTRY % (match.group(1), match.group(1))
+
+        match = FUNCTION_END_PATTERN.match(line)
+        if (match):
+            # Check that the function end has the same MUL_ or NESTED_ prefix as the function start
+            if (self.is_nested_function ^ (match.group(1) == "NESTED_")) or \
+                (self.is_mul_function ^ (match.group(2) == "MUL_")):
+                logging.error("Function start and end do not have the same MUL_ or NESTED_ prefix!\n\tStart: %s (line %d)\n\tEnd: %s (line %d)"
+                    % (self.function_start_match.group(0), self.function_start_line, match.group(0), line_num))
+                exit(1)
+            # Check that the function end label matches the function start label
+            if self.function_name != match.groups()[-1]:
+                logging.error("Function start label does not match function end label!\n\tStart: %s (line %d)\n\tEnd: %s (line %d)"
+                    % (self.function_name, self.function_start_line, match.groups()[-1], line_num))
+                exit(1)
+
+            epilogue = generate_epilogue(self.assembler, self.calling_convention, self.function_name, self.arg_count, self.reg_count, self.is_nested_function)
+
+            logging.info("%d: function end %s" % (line_num, self.function_name))
+
+            self.function_start_match = None
+            self.function_start_line = 0
+            self.is_nested_function = None
+            self.is_mul_function = None
+            self.calling_convention = None
+            self.function_name = None
+            self.arg_count = None
+            self.reg_count = None
+
+            return epilogue
+
+        # Replace any GET_MEMSLOT_OFFSET macros in the line
+        match = GET_MEMSLOT_PATTERN.search(line)
+        while(match):
+            slot = int(match.group(1))
+            replacement = self.calling_convention.gen_get_memslot_offset_fn(slot, self.arg_count, self.reg_count)
+            line = GET_MEMSLOT_PATTERN.sub(replacement, line)
+            match = GET_MEMSLOT_PATTERN.search(line)
+
+            logging.info("%d: memslot macro %d" % (line_num, slot))
+
+        # Not modifying the line any further
+        return line
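+
+    # Illustration (values follow from gen_get_memslot_offset_amd64_systemv above): in a
+    # leaf (non-nested) amd64 SystemV function, GET_MEMSLOT_OFFSET(slot0) in a source
+    # line is rewritten to
+    #   -8 /*MEMSLOT0*/
+    # (the first 8B slot of the red zone below rsp), whereas in a nested function it
+    # becomes 0 /*MEMSLOT0*/, the bottom of the 32B scratch space that the nested
+    # prologue reserves above rsp.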
+
+    def process_macro_line(self, line, line_num):
+        # Currently in a macro
+        match = MACRO_END_PATTERN.match(line)
+        if (match):
+            logging.info("%d: macro end %s" % (line_num, self.macro_name))
+
+            self.macro_start_match = None
+            self.macro_name = None
+            self.macro_args = None
+
+            if self.assembler == "masm":
+                return MASM_MACRO_END
+            elif self.assembler == "gas":
+                return GAS_MACRO_END
+
+        if self.assembler == "gas":
+            # In GAS macros we need to escape all of the macro arguments with a backslash in the macro body.
+            # Note this is a plain pattern substitution, so macro argument names must not appear as
+            # substrings of other identifiers in the macro body.
+            for arg in self.macro_args:
+                line = re.sub(arg, r"\\%s" % arg, line)
+
+        # Not modifying the line any further
+        return line
+
+def process_file(target, infilename, outfilename):
+    assembler = None
+    if target == "masm":
+        assembler = "masm"
+        normal_calling_convention = CALLING_CONVENTION_AMD64_MSFT
+        mul_calling_convention = CALLING_CONVENTION_AMD64_MSFT_MUL
+        nested_calling_convention = CALLING_CONVENTION_AMD64_MSFT_NESTED
+    elif target == "gas":
+        assembler = "gas"
+        normal_calling_convention = CALLING_CONVENTION_AMD64_SYSTEMV
+        mul_calling_convention = CALLING_CONVENTION_AMD64_SYSTEMV_MUL
+        nested_calling_convention = CALLING_CONVENTION_AMD64_SYSTEMV_NESTED
+    else:
+        logging.error("Unhandled target (%s) - only masm and gas are supported!" % target)
+        exit(1)
+
+    # Iterate through the file line by line in one pass
+    file_processing_state = ProcessingStateMachine(
+        assembler, normal_calling_convention, mul_calling_convention, nested_calling_convention)
+
+    with open(infilename) as infile:
+        with open(outfilename, "w") as outfile:
+            for line_num, line in enumerate(infile, start=1):
+                processed_line = file_processing_state.process_line(line, line_num)
+                outfile.write(processed_line)
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser(description="Preprocess symcryptasm into files that will be further processed with the C preprocessor to generate MASM or GAS")
+    parser.add_argument('target', type=str, choices=['masm', 'gas'], help='Target assembler to preprocess for')
+    parser.add_argument('inputfile', type=str, help='Path to input file')
+    parser.add_argument('outputfile', type=str, help='Path to output file')
+
+    args = parser.parse_args()
+    process_file(args.target, args.inputfile, args.outputfile)
diff --git a/unittest/lib/main.cpp b/unittest/lib/main.cpp
index e9b46ec..2ade7d7 100644
--- a/unittest/lib/main.cpp
+++ b/unittest/lib/main.cpp
@@ -7,7 +7,6 @@
 
 #include "precomp.h"
 
-#define EQU =
 #include "C_asm_shared.inc"
 
 VOID
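
Usage sketch (file names are hypothetical and the sys.path handling is an assumption; the script itself only uses the Python standard library):

    import sys
    sys.path.insert(0, "scripts")  # make scripts/symcryptasm_processor.py importable (assumption)
    from symcryptasm_processor import process_file

    # Rewrite a symcryptasm source into input for the C preprocessor, targeting GAS;
    # passing "masm" instead selects the MSFT x64 calling conventions and MASM markup.
    process_file("gas", "lib/example_asm.symcryptasm", "example_asm.cppasm")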