зеркало из https://github.com/microsoft/SymCrypt.git
Merged PR 5854070: Introduce symcryptasm format to enable use of asm in Windows and Linux
+ Introduce a 2 stage pre-processing setup to convert .symcryptasm to either masm (msft x64 calling convention) or gas (SystemV amd64 calling convention) + Step 1 converts .symcryptasm to .cppasm (using `lib\symcryptasm_processor.py`) + Step 2 converts .cppasm to .asm using the C preprocessor + Updated CMakeLists.txt to invoke this preprocesssing when any relevant files is updated + Also introduced makefile.inc for the razzle build + I have translated all of the amd64 asm files we want to preserve, and the performance for big integer reliant code is the same on Windows and Linux (and a bit better on Windows than before :)) + In translation I did some tidying of the underlying assembly: + Removing needless work (some size specific functions in particular had cruft from their adaptation from the generic sized versions) + Reducing code size (i.e. by using inc/dec rather than add/sub 1) + Some micro-optimizations to remove needless instruction dependencies Related work items: #30621935
This commit is contained in:
Родитель
27765f9929
Коммит
77d1e446e4
|
@ -28,9 +28,8 @@ set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib/${CMAKE_SYSTEM_PROCES
|
|||
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/exe/${CMAKE_SYSTEM_PROCESSOR}/${SYMCRYPT_TARGET_ENV})
|
||||
|
||||
if(WIN32 AND SYMCRYPT_TARGET_ENV MATCHES "WindowsUserMode")
|
||||
# Set DBG=1 and enable ASM_MASM. Annoyingly, this has to be done in the main CMake file rather than in
|
||||
# the toolchain file
|
||||
add_compile_options(-DDBG=1)
|
||||
# Enable ASM_MASM. Annoyingly, this has to be done in the main CMake file rather than in the
|
||||
# toolchain file
|
||||
enable_language(ASM_MASM)
|
||||
add_compile_options(/MP)
|
||||
# Remove /RTC1, incompatible of /Ox
|
||||
|
@ -43,16 +42,23 @@ if(WIN32 AND SYMCRYPT_TARGET_ENV MATCHES "WindowsUserMode")
|
|||
string( REPLACE "/Od" "" CMAKE_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG})
|
||||
string( REPLACE "/Od" "" CMAKE_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE})
|
||||
string( REPLACE "/Od" "" CMAKE_C_FLAGS_RELEASE ${CMAKE_C_FLAGS_RELEASE})
|
||||
|
||||
IF(CMAKE_BUILD_TYPE MATCHES Release)
|
||||
message("Release mode")
|
||||
|
||||
if(CMAKE_BUILD_TYPE MATCHES Release)
|
||||
add_compile_options(/Oxs)
|
||||
ENDIF(CMAKE_BUILD_TYPE MATCHES Release)
|
||||
endif()
|
||||
elseif(NOT WIN32)
|
||||
enable_language(ASM)
|
||||
add_compile_options(-Wno-deprecated-declarations -Wno-deprecated)
|
||||
add_compile_options(-g)
|
||||
add_compile_options(-Wno-multichar)
|
||||
add_compile_options(-fPIC)
|
||||
endif()
|
||||
|
||||
if(CMAKE_BUILD_TYPE MATCHES Release)
|
||||
message("Release mode")
|
||||
else()
|
||||
message("Debug mode")
|
||||
add_compile_options(-DDBG=1)
|
||||
endif()
|
||||
|
||||
include_directories(inc)
|
||||
|
|
11
README.md
11
README.md
|
@ -1,4 +1,4 @@
|
|||
# Introduction
|
||||
# Introduction
|
||||
SymCrypt is the core cryptographic function library currently used by Windows.
|
||||
|
||||
## History
|
||||
|
@ -30,20 +30,23 @@ or gcc 7.4.0 on Linux. Note that CMake ships with Visual Studio 2019.
|
|||
4. Configure CMake compilation:
|
||||
* For 32-bit Windows targets: `cmake .. -DCMAKE_TOOLCHAIN_FILE=../cmake-toolchain/windows-x86.cmake -A Win32`
|
||||
* For 64-bit Windows targets: `cmake .. -DCMAKE_TOOLCHAIN_FILE=../cmake-toolchain/windows-amd64.cmake`
|
||||
* For Linux (or Windows with no CPU optimizations): `cmake ..`
|
||||
* For 64-bit Linux targets: `cmake .. -DCMAKE_TOOLCHAIN_FILE=../cmake-toolchain/linux-amd64.cmake`
|
||||
* For no CPU optimizations: `cmake ..`
|
||||
* Optionally, for a release build, specify `-DCMAKE_BUILD_TYPE=Release`
|
||||
5. `cmake --build .`
|
||||
* Optionally specify -jN where N is the number of processes you wish to spawn for the build
|
||||
|
||||
If compilation succeeds, the output will be put in the `exe` subdirectory relative to where compilation occurred
|
||||
(i.e. `bin/exe` if you followed the instructions above).
|
||||
|
||||
The SymCrypt unit test is in the `unittest` directory. It runs extensive functional tests on the SymCrypt
|
||||
library. On Windows it also compares results against on other implementations such as the Windows APIs CNG
|
||||
and CAPI, and the older crypto libraries rsa32 and msbignum, if they are available. It also provides
|
||||
and CAPI, and the older crypto libraries rsa32 and msbignum, if they are available. It also provides
|
||||
detailed performance information.
|
||||
|
||||
# Security Bugs
|
||||
If you believe you have found a problem that affects the security of this code, please do **NOT** create an issue
|
||||
or pull request, but instead email your comments to secure@microsoft.com.
|
||||
or pull request, but instead email your comments to secure@microsoft.com.
|
||||
|
||||
# Contribute
|
||||
We love to receive comments and suggestions. Unfortunately we cannot accept external code contributions at this time.
|
||||
|
|
|
@ -10,7 +10,6 @@ set(SYMCRYPT_TARGET_ENV Linux)
|
|||
|
||||
# Define _AMD64_ to set up the correct SymCrypt macros, e.g. SYMCRYPT_CPU_AMD64
|
||||
add_compile_options(-D_AMD64_)
|
||||
add_compile_options(-DDBG)
|
||||
add_compile_options(-O3)
|
||||
|
||||
# Enable a baseline of features for the compiler to support everywhere
|
||||
|
|
|
@ -1,70 +1,43 @@
|
|||
;/*
|
||||
; C_asm_shared.inc file to synchronize C and Asm information
|
||||
; Copyright (c) Microsoft Corporation. Licensed under the MIT license.
|
||||
/*
|
||||
C_asm_shared.inc file to synchronize C and Asm information
|
||||
Copyright (c) Microsoft Corporation. Licensed under the MIT license.
|
||||
|
||||
; This is a file that compiles both in C and ASM to define values in a way that is guaranteed to be the same on both sides.
|
||||
; We use this to define the structure offsets that the ASM code uses.
|
||||
; By having equivalent C constants we can add checks to the C code to ensure they are correct.
|
||||
;
|
||||
; This is an ugly hack, but it works :-)
|
||||
;
|
||||
; Due to the fact that the ARM assemblers use the C precompiler
|
||||
; the C files have to redefine EQU to nothing before including this file.
|
||||
; */
|
||||
This is a file that is included in both C and ASM such that the values are the same on both sides.
|
||||
We use the C preprocessor to set ASM constants, as we already need to use the C preprocessor for
|
||||
symcryptasm processing (see scripts/symcryptasm_processor.py).
|
||||
We use this to define the structure offsets that the ASM code uses.
|
||||
By having equivalent C constants we can add checks to the C code to ensure they are correct.
|
||||
|
||||
*/
|
||||
|
||||
;const SIZE_T
|
||||
SymCryptModulusNdigitsOffsetAmd64 EQU 4;
|
||||
#if defined(SYMCRYPT_MASM)
|
||||
#define SET(_variable, _value) _variable EQU _value
|
||||
#elif defined(SYMCRYPT_GAS)
|
||||
#define SET(_variable, _value) .set _variable, _value
|
||||
#else // assume C
|
||||
#define SET(_variable, _value) const SIZE_T _variable = _value;
|
||||
#endif
|
||||
|
||||
; const SIZE_T
|
||||
SymCryptModulusMontgomeryInv64OffsetAmd64 EQU 32;
|
||||
SET(SymCryptModulusNdigitsOffsetAmd64, 4);
|
||||
SET(SymCryptModulusMontgomeryInv64OffsetAmd64, 32);
|
||||
SET(SymCryptModulusValueOffsetAmd64, 128);
|
||||
|
||||
; const SIZE_T
|
||||
SymCryptModulusValueOffsetAmd64 EQU 128;
|
||||
SET(SymCryptModulusNdigitsOffsetX86, 4);
|
||||
SET(SymCryptModulusMontgomeryInv64OffsetX86, 24);
|
||||
SET(SymCryptModulusValueOffsetX86, 96);
|
||||
|
||||
SET(SymCryptModulusNdigitsOffsetArm64, 4);
|
||||
SET(SymCryptModulusMontgomeryInv64OffsetArm64, 32);
|
||||
SET(SymCryptModulusValueOffsetArm64, 128);
|
||||
|
||||
SET(SymCryptModulusNdigitsOffsetArm, 4);
|
||||
SET(SymCryptModulusMontgomeryInv64OffsetArm, 24);
|
||||
SET(SymCryptModulusValueOffsetArm, 96);
|
||||
|
||||
|
||||
;const SIZE_T
|
||||
SymCryptModulusNdigitsOffsetX86 EQU 4;
|
||||
|
||||
; const SIZE_T
|
||||
SymCryptModulusMontgomeryInv64OffsetX86 EQU 24;
|
||||
|
||||
; const SIZE_T
|
||||
SymCryptModulusValueOffsetX86 EQU 96;
|
||||
|
||||
|
||||
|
||||
|
||||
;const SIZE_T
|
||||
SymCryptModulusNdigitsOffsetArm64 EQU 4;
|
||||
|
||||
; const SIZE_T
|
||||
SymCryptModulusMontgomeryInv64OffsetArm64 EQU 32;
|
||||
|
||||
; const SIZE_T
|
||||
SymCryptModulusValueOffsetArm64 EQU 128;
|
||||
|
||||
|
||||
|
||||
|
||||
;const SIZE_T
|
||||
SymCryptModulusNdigitsOffsetArm EQU 4;
|
||||
|
||||
; const SIZE_T
|
||||
SymCryptModulusMontgomeryInv64OffsetArm EQU 24;
|
||||
|
||||
; const SIZE_T
|
||||
SymCryptModulusValueOffsetArm EQU 96;
|
||||
|
||||
|
||||
|
||||
|
||||
; /*
|
||||
IF 0
|
||||
; */
|
||||
#undef EQU
|
||||
#if !defined(SYMCRYPT_MASM) && !defined(SYMCRYPT_GAS)
|
||||
// Preserve the definition of SET for use in symcryptasm processing
|
||||
#undef SET
|
||||
#endif
|
||||
|
||||
#if SYMCRYPT_CPU_AMD64
|
||||
#define SYMCRYPT_CHECK_ASM_OFFSETS \
|
||||
|
@ -89,14 +62,9 @@ SymCryptModulusValueOffsetArm EQU 96;
|
|||
SYMCRYPT_CHECK_ASM_OFFSET( SymCryptModulusNdigitsOffsetArm, SYMCRYPT_FIELD_OFFSET( SYMCRYPT_MODULUS, nDigits ) );\
|
||||
SYMCRYPT_CHECK_ASM_OFFSET( SymCryptModulusMontgomeryInv64OffsetArm, SYMCRYPT_FIELD_OFFSET( SYMCRYPT_MODULUS, tm.montgomery.inv64 ));\
|
||||
SYMCRYPT_CHECK_ASM_OFFSET( SymCryptModulusValueOffsetArm, SYMCRYPT_FIELD_OFFSET( SYMCRYPT_MODULUS, Divisor.Int.ti.fdef.uint32 ));\
|
||||
|
||||
|
||||
#endif // CPU_*
|
||||
|
||||
#if !defined( SYMCRYPT_CHECK_ASM_OFFSETS)
|
||||
#define SYMCRYPT_CHECK_ASM_OFFSETS
|
||||
#endif
|
||||
|
||||
|
||||
; /*
|
||||
ENDIF
|
||||
; */
|
|
@ -96,42 +96,148 @@ set(SOURCES_COMMON
|
|||
IEEE802_11SaeCustom.c
|
||||
)
|
||||
|
||||
function(process_cppasm filepath outformat archdefine)
|
||||
get_filename_component(fileextension ${filepath} EXT)
|
||||
if(NOT fileextension STREQUAL .cppasm)
|
||||
message(FATAL_ERROR "cppasm processing invoked on file with incorrect extension (${filepath} -> ${fileextension})")
|
||||
endif()
|
||||
if((NOT outformat STREQUAL gas) AND (NOT outformat STREQUAL masm))
|
||||
message(FATAL_ERROR "cppasm processing invoked with unrecognized outformat (${outformat})")
|
||||
endif()
|
||||
if((NOT archdefine STREQUAL amd64) AND (NOT archdefine STREQUAL x86))
|
||||
message(FATAL_ERROR "cppasm processing invoked with unrecognized archdefine (${archdefine})")
|
||||
endif()
|
||||
get_filename_component(rootpath ${filepath} DIRECTORY)
|
||||
get_filename_component(filestem ${filepath} NAME_WE) # filestem is the filename w/out extension
|
||||
string(TOUPPER ${outformat} outformatupper)
|
||||
string(TOUPPER ${archdefine} archdefineupper)
|
||||
string(FIND ${rootpath} ${CMAKE_CURRENT_BINARY_DIR} findindex) # check whether input is in the output directory
|
||||
if(findindex EQUAL -1) # input in the source directory
|
||||
set(filepath ${CMAKE_CURRENT_SOURCE_DIR}/${filepath})
|
||||
set(output_pass2 ${CMAKE_CURRENT_BINARY_DIR}/${rootpath}/${filestem}-${outformat}.asm)
|
||||
else() # input in the output directory
|
||||
set(output_directory ${rootpath})
|
||||
set(output_pass2 ${rootpath}/${filestem}.asm)
|
||||
endif()
|
||||
|
||||
set(dbg_definition "")
|
||||
if(CMAKE_BUILD_TYPE MATCHES Debug)
|
||||
set(dbg_definition "-DDBG=1")
|
||||
endif()
|
||||
|
||||
if(outformat STREQUAL gas)
|
||||
# assume gas => GCC compatible C compiler
|
||||
add_custom_command(
|
||||
OUTPUT ${output_pass2}
|
||||
COMMAND "${CMAKE_C_COMPILER}" -E -P -x c ${filepath} -o ${output_pass2}
|
||||
-I${CMAKE_CURRENT_SOURCE_DIR} -I${CMAKE_CURRENT_SOURCE_DIR}/${rootpath} -I${CMAKE_SOURCE_DIR}/inc
|
||||
-DSYMCRYPT_${outformatupper} -DSYMCRYPT_CPU_${archdefineupper} ${dbg_definition}
|
||||
MAIN_DEPENDENCY ${filepath}
|
||||
DEPENDS ${CMAKE_SOURCE_DIR}/inc/C_asm_shared.inc ${filepath} symcryptasm_shared.cppasm
|
||||
COMMENT "C preprocessing ${filepath} to ${outformat} (${output_pass2})"
|
||||
VERBATIM)
|
||||
elseif(outformat STREQUAL masm)
|
||||
# assume masm => MSVC C compiler
|
||||
add_custom_command(
|
||||
OUTPUT ${output_pass2}
|
||||
COMMAND "${CMAKE_C_COMPILER}" /EP /P /Fi${output_pass2} ${filepath}
|
||||
-I${CMAKE_CURRENT_SOURCE_DIR} -I${CMAKE_CURRENT_SOURCE_DIR}/${rootpath} -I${CMAKE_SOURCE_DIR}/inc
|
||||
-DSYMCRYPT_${outformatupper} -DSYMCRYPT_CPU_${archdefineupper} ${dbg_definition}
|
||||
MAIN_DEPENDENCY ${filepath}
|
||||
DEPENDS ${CMAKE_SOURCE_DIR}/inc/C_asm_shared.inc ${filepath} symcryptasm_shared.cppasm
|
||||
COMMENT "C preprocessing ${filepath} to ${outformat} (${output_pass2})"
|
||||
VERBATIM)
|
||||
endif()
|
||||
endfunction()
|
||||
|
||||
function(process_symcryptasm filepath outformat archdefine)
|
||||
get_filename_component(fileextension ${filepath} EXT)
|
||||
if(NOT fileextension STREQUAL .symcryptasm)
|
||||
message(FATAL_ERROR "symcryptasm processing invoked on file with incorrect extension (${filepath} -> ${fileextension})")
|
||||
endif()
|
||||
if((NOT outformat STREQUAL gas) AND (NOT outformat STREQUAL masm))
|
||||
message(FATAL_ERROR "symcryptasm processing invoked with unrecognized outformat (${outformat})")
|
||||
endif()
|
||||
get_filename_component(rootpath ${filepath} DIRECTORY)
|
||||
get_filename_component(filestem ${filepath} NAME_WE) # filestem is the filename w/out extension
|
||||
set(filepath ${CMAKE_CURRENT_SOURCE_DIR}/${filepath})
|
||||
set(output_directory ${CMAKE_CURRENT_BINARY_DIR}/${rootpath})
|
||||
set(output_cppasm ${output_directory}/${filestem}-${outformat}.cppasm)
|
||||
|
||||
add_custom_command(
|
||||
OUTPUT ${output_cppasm}
|
||||
COMMAND ${CMAKE_COMMAND} -E make_directory ${output_directory}
|
||||
COMMAND python3 ${CMAKE_SOURCE_DIR}/scripts/symcryptasm_processor.py ${outformat} ${filepath} ${output_cppasm}
|
||||
MAIN_DEPENDENCY ${filepath}
|
||||
DEPENDS ${CMAKE_SOURCE_DIR}/scripts/symcryptasm_processor.py
|
||||
COMMENT "Python preprocessing ${filepath} to ${outformat} (${output_cppasm})"
|
||||
VERBATIM)
|
||||
|
||||
process_cppasm(${output_cppasm} ${outformat} ${archdefine})
|
||||
endfunction()
|
||||
|
||||
if(NOT WIN32)
|
||||
list(APPEND SOURCES_COMMON linux/intrinsics.c)
|
||||
list(APPEND SOURCES_COMMON linux/asmstubs.c)
|
||||
endif()
|
||||
|
||||
if(WIN32 AND NOT(SYMCRYPT_TARGET_ENV MATCHES "Generic"))
|
||||
if(CMAKE_SYSTEM_PROCESSOR MATCHES "AMD64")
|
||||
process_symcryptasm(amd64/aesasm.symcryptasm masm amd64)
|
||||
process_symcryptasm(amd64/fdef_asm.symcryptasm masm amd64)
|
||||
process_symcryptasm(amd64/fdef369_asm.symcryptasm masm amd64)
|
||||
process_symcryptasm(amd64/fdef_mulx.symcryptasm masm amd64)
|
||||
process_symcryptasm(amd64/wipe.symcryptasm masm amd64)
|
||||
|
||||
list(APPEND SOURCES_COMMON
|
||||
amd64/aesasm.asm
|
||||
amd64/fdef_asm.asm
|
||||
amd64/fdef_mulx.asm
|
||||
amd64/fdef369_asm.asm
|
||||
amd64/sha1asm.asm
|
||||
amd64/wipe.asm)
|
||||
amd64/aesasm-masm.asm
|
||||
amd64/fdef_asm-masm.asm
|
||||
amd64/fdef369_asm-masm.asm
|
||||
amd64/fdef_mulx-masm.asm
|
||||
amd64/wipe-masm.asm)
|
||||
set_source_files_properties(
|
||||
amd64/aesasm.asm
|
||||
amd64/fdef_asm.asm
|
||||
amd64/fdef_mulx.asm
|
||||
amd64/fdef369_asm.asm
|
||||
amd64/sha1asm.asm
|
||||
amd64/wipe.asm
|
||||
amd64/aesasm-masm.asm
|
||||
amd64/fdef_asm-masm.asm
|
||||
amd64/fdef369_asm-masm.asm
|
||||
amd64/fdef_mulx-masm.asm
|
||||
amd64/wipe-masm.asm
|
||||
PROPERTY LANGUAGE ASM_MASM)
|
||||
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "X86")
|
||||
process_cppasm(i386/fdef_asm.cppasm masm x86)
|
||||
|
||||
list(APPEND SOURCES_COMMON
|
||||
i386/aesasm.asm
|
||||
i386/fdef_asm.asm
|
||||
i386/rc4asm.asm
|
||||
i386/sha1asm.asm
|
||||
i386/fdef_asm-masm.asm
|
||||
i386/wipe.asm)
|
||||
set_source_files_properties(
|
||||
i386/aesasm.asm
|
||||
i386/fdef_asm.asm
|
||||
i386/rc4asm.asm
|
||||
i386/sha1asm.asm
|
||||
i386/fdef_asm-masm.asm
|
||||
i386/wipe.asm
|
||||
PROPERTY LANGUAGE ASM_MASM)
|
||||
set_source_files_properties(
|
||||
i386/fdef_asm-masm.asm PROPERTIES INCLUDE_DIRECTORIES ${CMAKE_CURRENT_SOURCE_DIR}/i386)
|
||||
endif()
|
||||
else()
|
||||
if(CMAKE_SYSTEM_PROCESSOR MATCHES "AMD64")
|
||||
process_symcryptasm(amd64/aesasm.symcryptasm gas amd64)
|
||||
process_symcryptasm(amd64/fdef_asm.symcryptasm gas amd64)
|
||||
process_symcryptasm(amd64/fdef369_asm.symcryptasm gas amd64)
|
||||
process_symcryptasm(amd64/fdef_mulx.symcryptasm gas amd64)
|
||||
process_symcryptasm(amd64/wipe.symcryptasm gas amd64)
|
||||
|
||||
list(APPEND SOURCES_COMMON
|
||||
amd64/aesasm-gas.asm
|
||||
amd64/fdef_asm-gas.asm
|
||||
amd64/fdef369_asm-gas.asm
|
||||
amd64/fdef_mulx-gas.asm
|
||||
amd64/wipe-gas.asm)
|
||||
set_source_files_properties(
|
||||
amd64/aesasm-gas.asm
|
||||
amd64/fdef_asm-gas.asm
|
||||
amd64/fdef369_asm-gas.asm
|
||||
amd64/fdef_mulx-gas.asm
|
||||
amd64/wipe-gas.asm
|
||||
PROPERTY LANGUAGE ASM)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
|
|
108
lib/a_dispatch.c
108
lib/a_dispatch.c
|
@ -22,7 +22,7 @@ const SYMCRYPT_MODULAR_FUNCTIONS g_SymCryptModFns[] = {
|
|||
SYMCRYPT_MOD_FUNCTIONS_FDEF_GENERIC, // Handles any type of modulus
|
||||
SYMCRYPT_MOD_FUNCTIONS_FDEF_MONTGOMERY, // Montgomery, only for odd parity-public moduli
|
||||
|
||||
#if SYMCRYPT_CPU_AMD64 && SYMCRYPT_MS_VC
|
||||
#if SYMCRYPT_CPU_AMD64
|
||||
|
||||
SYMCRYPT_MOD_FUNCTIONS_FDEF369_MONTGOMERY, // optimized for 384 and 576-bit moduli
|
||||
SYMCRYPT_MOD_FUNCTIONS_FDEF_MONTGOMERY256, // Special faster code for 256-bit Montgomery moduli
|
||||
|
@ -55,12 +55,12 @@ const UINT32 g_SymCryptModFnsMask = sizeof( g_SymCryptModFns ) - sizeof( g_SymCr
|
|||
//
|
||||
// Tweaking the selection & function tables allows different tradeoffs of performance vs codesize
|
||||
//
|
||||
SYMCRYPT_MODULUS_TYPE_SELECTION_ENTRY SymCryptModulusTypeSelections[] =
|
||||
SYMCRYPT_MODULUS_TYPE_SELECTION_ENTRY SymCryptModulusTypeSelections[] =
|
||||
{
|
||||
#if SYMCRYPT_CPU_AMD64 && SYMCRYPT_MS_VC
|
||||
#if SYMCRYPT_CPU_AMD64
|
||||
// Mulx used for 257-512 and 577-... bits
|
||||
{('2M' << 16) + SymCryptModFntableMontgomery256, 0, 256, SYMCRYPT_MODULUS_FEATURE_MONTGOMERY },
|
||||
{('xM' << 16) + SymCryptModFntableMontgomeryMulx, SYMCRYPT_CPU_FEATURES_FOR_MULX, 512, SYMCRYPT_MODULUS_FEATURE_MONTGOMERY },
|
||||
{('xM' << 16) + SymCryptModFntableMontgomeryMulx, SYMCRYPT_CPU_FEATURES_FOR_MULX, 512, SYMCRYPT_MODULUS_FEATURE_MONTGOMERY },
|
||||
{('9M' << 16) + SymCryptModFntable369Montgomery, 0, 384, SYMCRYPT_MODULUS_FEATURE_MONTGOMERY },
|
||||
{('5M' << 16) + SymCryptModFntableMontgomery512, 0, 512, SYMCRYPT_MODULUS_FEATURE_MONTGOMERY },
|
||||
{('9M' << 16) + SymCryptModFntable369Montgomery, 0, 576, SYMCRYPT_MODULUS_FEATURE_MONTGOMERY },
|
||||
|
@ -118,9 +118,9 @@ SymCryptSizeofIntFromDigits( UINT32 nDigits )
|
|||
|
||||
PSYMCRYPT_INT
|
||||
SYMCRYPT_CALL
|
||||
SymCryptIntCreate(
|
||||
_Out_writes_bytes_( cbBuffer ) PBYTE pbBuffer,
|
||||
SIZE_T cbBuffer,
|
||||
SymCryptIntCreate(
|
||||
_Out_writes_bytes_( cbBuffer ) PBYTE pbBuffer,
|
||||
SIZE_T cbBuffer,
|
||||
UINT32 nDigits )
|
||||
{
|
||||
return SymCryptFdefIntCreate( pbBuffer, cbBuffer, nDigits );
|
||||
|
@ -138,8 +138,8 @@ SymCryptIntWipe( _Out_ PSYMCRYPT_INT piDst )
|
|||
|
||||
VOID
|
||||
SYMCRYPT_CALL
|
||||
SymCryptIntCopy(
|
||||
_In_ PCSYMCRYPT_INT piSrc,
|
||||
SymCryptIntCopy(
|
||||
_In_ PCSYMCRYPT_INT piSrc,
|
||||
_Out_ PSYMCRYPT_INT piDst )
|
||||
{
|
||||
SymCryptFdefIntCopy( piSrc, piDst );
|
||||
|
@ -191,8 +191,8 @@ SymCryptIntDigitsizeOfObject( _In_ PCSYMCRYPT_INT piSrc )
|
|||
|
||||
SYMCRYPT_ERROR
|
||||
SYMCRYPT_CALL
|
||||
SymCryptIntCopyMixedSize(
|
||||
_In_ PCSYMCRYPT_INT piSrc,
|
||||
SymCryptIntCopyMixedSize(
|
||||
_In_ PCSYMCRYPT_INT piSrc,
|
||||
_Out_ PSYMCRYPT_INT piDst )
|
||||
{
|
||||
return SymCryptFdefIntCopyMixedSize( piSrc, piDst );
|
||||
|
@ -207,8 +207,8 @@ SymCryptIntBitsizeOfValue( _In_ PCSYMCRYPT_INT piSrc )
|
|||
|
||||
VOID
|
||||
SYMCRYPT_CALL
|
||||
SymCryptIntSetValueUint32(
|
||||
UINT32 u32Src,
|
||||
SymCryptIntSetValueUint32(
|
||||
UINT32 u32Src,
|
||||
_Out_ PSYMCRYPT_INT piDst )
|
||||
{
|
||||
SymCryptFdefIntSetValueUint32( u32Src, piDst );
|
||||
|
@ -216,8 +216,8 @@ SymCryptIntSetValueUint32(
|
|||
|
||||
VOID
|
||||
SYMCRYPT_CALL
|
||||
SymCryptIntSetValueUint64(
|
||||
UINT64 u64Src,
|
||||
SymCryptIntSetValueUint64(
|
||||
UINT64 u64Src,
|
||||
_Out_ PSYMCRYPT_INT piDst )
|
||||
{
|
||||
SymCryptFdefIntSetValueUint64( u64Src, piDst );
|
||||
|
@ -225,10 +225,10 @@ SymCryptIntSetValueUint64(
|
|||
|
||||
SYMCRYPT_ERROR
|
||||
SYMCRYPT_CALL
|
||||
SymCryptIntSetValue(
|
||||
_In_reads_bytes_(cbSrc) PCBYTE pbSrc,
|
||||
SIZE_T cbSrc,
|
||||
SYMCRYPT_NUMBER_FORMAT format,
|
||||
SymCryptIntSetValue(
|
||||
_In_reads_bytes_(cbSrc) PCBYTE pbSrc,
|
||||
SIZE_T cbSrc,
|
||||
SYMCRYPT_NUMBER_FORMAT format,
|
||||
_Out_ PSYMCRYPT_INT piDst )
|
||||
{
|
||||
return SymCryptFdefIntSetValue( pbSrc, cbSrc, format, piDst );
|
||||
|
@ -236,10 +236,10 @@ SymCryptIntSetValue(
|
|||
|
||||
SYMCRYPT_ERROR
|
||||
SYMCRYPT_CALL
|
||||
SymCryptIntGetValue(
|
||||
_In_ PCSYMCRYPT_INT piSrc,
|
||||
_Out_writes_bytes_( cbDst ) PBYTE pbDst,
|
||||
SIZE_T cbDst,
|
||||
SymCryptIntGetValue(
|
||||
_In_ PCSYMCRYPT_INT piSrc,
|
||||
_Out_writes_bytes_( cbDst ) PBYTE pbDst,
|
||||
SIZE_T cbDst,
|
||||
SYMCRYPT_NUMBER_FORMAT format )
|
||||
{
|
||||
return SymCryptFdefIntGetValue( piSrc, pbDst, cbDst, format );
|
||||
|
@ -496,9 +496,9 @@ SymCryptSizeofDivisorFromDigits( UINT32 nDigits )
|
|||
|
||||
PSYMCRYPT_DIVISOR
|
||||
SYMCRYPT_CALL
|
||||
SymCryptDivisorCreate(
|
||||
_Out_writes_bytes_( cbBuffer ) PBYTE pbBuffer,
|
||||
SIZE_T cbBuffer,
|
||||
SymCryptDivisorCreate(
|
||||
_Out_writes_bytes_( cbBuffer ) PBYTE pbBuffer,
|
||||
SIZE_T cbBuffer,
|
||||
UINT32 nDigits )
|
||||
{
|
||||
return SymCryptFdefDivisorCreate( pbBuffer, cbBuffer, nDigits );
|
||||
|
@ -514,8 +514,8 @@ SymCryptDivisorWipe( _Out_ PSYMCRYPT_DIVISOR pdObj )
|
|||
}
|
||||
|
||||
VOID
|
||||
SymCryptDivisorCopy(
|
||||
_In_ PCSYMCRYPT_DIVISOR pdSrc,
|
||||
SymCryptDivisorCopy(
|
||||
_In_ PCSYMCRYPT_DIVISOR pdSrc,
|
||||
_Out_ PSYMCRYPT_DIVISOR pdDst )
|
||||
{
|
||||
SymCryptFdefDivisorCopy( pdSrc, pdDst );
|
||||
|
@ -585,9 +585,9 @@ SymCryptSizeofModulusFromDigits( UINT32 nDigits )
|
|||
|
||||
PSYMCRYPT_MODULUS
|
||||
SYMCRYPT_CALL
|
||||
SymCryptModulusCreate(
|
||||
_Out_writes_bytes_( cbBuffer ) PBYTE pbBuffer,
|
||||
SIZE_T cbBuffer,
|
||||
SymCryptModulusCreate(
|
||||
_Out_writes_bytes_( cbBuffer ) PBYTE pbBuffer,
|
||||
SIZE_T cbBuffer,
|
||||
UINT32 nDigits )
|
||||
{
|
||||
return SymCryptFdefModulusCreate( pbBuffer, cbBuffer, nDigits );
|
||||
|
@ -604,7 +604,7 @@ SymCryptModulusWipe( _Out_ PSYMCRYPT_MODULUS pmObj )
|
|||
|
||||
VOID
|
||||
SymCryptModulusCopy(
|
||||
_In_ PCSYMCRYPT_MODULUS pmSrc,
|
||||
_In_ PCSYMCRYPT_MODULUS pmSrc,
|
||||
_Out_ PSYMCRYPT_MODULUS pmDst )
|
||||
{
|
||||
SymCryptFdefModulusCopy( pmSrc, pmDst );
|
||||
|
@ -626,8 +626,8 @@ SymCryptModElementAllocate( _In_ PCSYMCRYPT_MODULUS pmMod )
|
|||
|
||||
VOID
|
||||
SYMCRYPT_CALL
|
||||
SymCryptModElementFree(
|
||||
_In_ PCSYMCRYPT_MODULUS pmMod,
|
||||
SymCryptModElementFree(
|
||||
_In_ PCSYMCRYPT_MODULUS pmMod,
|
||||
_Out_ PSYMCRYPT_MODELEMENT peObj )
|
||||
{
|
||||
SymCryptFdefModElementFree( pmMod, peObj );
|
||||
|
@ -642,9 +642,9 @@ SymCryptSizeofModElementFromModulus( PCSYMCRYPT_MODULUS pmMod )
|
|||
|
||||
PSYMCRYPT_MODELEMENT
|
||||
SYMCRYPT_CALL
|
||||
SymCryptModElementCreate(
|
||||
_Out_writes_bytes_( cbBuffer ) PBYTE pbBuffer,
|
||||
SIZE_T cbBuffer,
|
||||
SymCryptModElementCreate(
|
||||
_Out_writes_bytes_( cbBuffer ) PBYTE pbBuffer,
|
||||
SIZE_T cbBuffer,
|
||||
PCSYMCRYPT_MODULUS pmMod )
|
||||
{
|
||||
return SymCryptFdefModElementCreate( pbBuffer, cbBuffer, pmMod );
|
||||
|
@ -660,9 +660,9 @@ SymCryptModElementWipe(
|
|||
}
|
||||
|
||||
VOID
|
||||
SymCryptModElementCopy(
|
||||
SymCryptModElementCopy(
|
||||
_In_ PCSYMCRYPT_MODULUS pmMod,
|
||||
_In_ PCSYMCRYPT_MODELEMENT peSrc,
|
||||
_In_ PCSYMCRYPT_MODELEMENT peSrc,
|
||||
_Out_ PSYMCRYPT_MODELEMENT peDst )
|
||||
{
|
||||
SymCryptFdefModElementCopy( pmMod, peSrc, peDst );
|
||||
|
@ -671,7 +671,7 @@ SymCryptModElementCopy(
|
|||
VOID
|
||||
SymCryptModElementMaskedCopy(
|
||||
_In_ PCSYMCRYPT_MODULUS pmMod,
|
||||
_In_ PCSYMCRYPT_MODELEMENT peSrc,
|
||||
_In_ PCSYMCRYPT_MODELEMENT peSrc,
|
||||
_Out_ PSYMCRYPT_MODELEMENT peDst,
|
||||
UINT32 mask )
|
||||
{
|
||||
|
@ -753,7 +753,7 @@ SymCryptModElementToInt(
|
|||
PCUINT32 pData;
|
||||
|
||||
SYMCRYPT_ASSERT( piDst->nDigits >= pmMod->nDigits );
|
||||
|
||||
|
||||
pData = SYMCRYPT_MOD_CALL( pmMod ) modPreGet( pmMod, peSrc, pbScratch, cbScratch );
|
||||
|
||||
SymCryptFdefModElementToIntGeneric( pmMod, pData, piDst, pbScratch, cbScratch );
|
||||
|
@ -762,17 +762,17 @@ SymCryptModElementToInt(
|
|||
SYMCRYPT_DISABLE_CFG
|
||||
SYMCRYPT_ERROR
|
||||
SYMCRYPT_CALL
|
||||
SymCryptModElementSetValue(
|
||||
_In_reads_bytes_( cbSrc ) PCBYTE pbSrc,
|
||||
SIZE_T cbSrc,
|
||||
SYMCRYPT_NUMBER_FORMAT format,
|
||||
SymCryptModElementSetValue(
|
||||
_In_reads_bytes_( cbSrc ) PCBYTE pbSrc,
|
||||
SIZE_T cbSrc,
|
||||
SYMCRYPT_NUMBER_FORMAT format,
|
||||
PCSYMCRYPT_MODULUS pmMod,
|
||||
_Out_ PSYMCRYPT_MODELEMENT peDst,
|
||||
_Out_writes_bytes_( cbScratch ) PBYTE pbScratch,
|
||||
SIZE_T cbScratch )
|
||||
{
|
||||
SYMCRYPT_ERROR scError;
|
||||
|
||||
|
||||
scError = SymCryptFdefModElementSetValueGeneric( pbSrc, cbSrc, format, pmMod, peDst, pbScratch, cbScratch );
|
||||
|
||||
if( scError == SYMCRYPT_NO_ERROR )
|
||||
|
@ -785,11 +785,11 @@ SymCryptModElementSetValue(
|
|||
|
||||
SYMCRYPT_ERROR
|
||||
SYMCRYPT_CALL
|
||||
SymCryptModElementGetValue(
|
||||
SymCryptModElementGetValue(
|
||||
PCSYMCRYPT_MODULUS pmMod,
|
||||
_In_ PCSYMCRYPT_MODELEMENT peSrc,
|
||||
_Out_writes_bytes_( cbDst ) PBYTE pbDst,
|
||||
SIZE_T cbDst,
|
||||
_Out_writes_bytes_( cbDst ) PBYTE pbDst,
|
||||
SIZE_T cbDst,
|
||||
SYMCRYPT_NUMBER_FORMAT format,
|
||||
_Out_writes_bytes_( cbScratch ) PBYTE pbScratch,
|
||||
SIZE_T cbScratch )
|
||||
|
@ -889,8 +889,8 @@ SymCryptModNeg(
|
|||
SYMCRYPT_DISABLE_CFG
|
||||
VOID
|
||||
SYMCRYPT_CALL
|
||||
SymCryptModElementSetValueUint32(
|
||||
UINT32 value,
|
||||
SymCryptModElementSetValueUint32(
|
||||
UINT32 value,
|
||||
_In_ PCSYMCRYPT_MODULUS pmMod,
|
||||
_Out_ PSYMCRYPT_MODELEMENT peDst,
|
||||
_Out_writes_bytes_( cbScratch ) PBYTE pbScratch,
|
||||
|
@ -903,8 +903,8 @@ SymCryptModElementSetValueUint32(
|
|||
|
||||
VOID
|
||||
SYMCRYPT_CALL
|
||||
SymCryptModElementSetValueNegUint32(
|
||||
UINT32 value,
|
||||
SymCryptModElementSetValueNegUint32(
|
||||
UINT32 value,
|
||||
_In_ PCSYMCRYPT_MODULUS pmMod,
|
||||
_Out_ PSYMCRYPT_MODELEMENT peDst,
|
||||
_Out_writes_bytes_( cbScratch ) PBYTE pbScratch,
|
||||
|
@ -994,7 +994,7 @@ SymCryptCreateTrialDivisionContext( UINT32 nDigits )
|
|||
|
||||
UINT32
|
||||
SYMCRYPT_CALL
|
||||
SymCryptIntFindSmallDivisor(
|
||||
SymCryptIntFindSmallDivisor(
|
||||
_In_ PCSYMCRYPT_TRIALDIVISION_CONTEXT pContext,
|
||||
_In_ PCSYMCRYPT_INT piSrc,
|
||||
_Out_writes_bytes_( cbScratch ) PBYTE pbScratch,
|
||||
|
|
1657
lib/amd64/aesasm.asm
1657
lib/amd64/aesasm.asm
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
|
@ -0,0 +1,964 @@
|
|||
//
|
||||
// aesasm.symcryptasm Assembler code for fast AES on the amd64
|
||||
// Expresses asm in a generic enough way to enable generation of MASM and GAS using the
|
||||
// symcryptasm_processor.py script and C preprocessor
|
||||
//
|
||||
// Copyright (c) Microsoft Corporation. Licensed under the MIT license.
|
||||
//
|
||||
// This code is derived from the AesFast implementation that
|
||||
// Niels Ferguson wrote from scratch for BitLocker during Vista.
|
||||
// That code is still in RSA32.
|
||||
//
|
||||
|
||||
// This file has only been partially translated into symcryptasm, external function calls use the
|
||||
// generic symcryptasm registers to convert different calling conventions into using the fixed register
|
||||
// layout used in aesasm. It seems likely that changing which registers AES state will be kept in in
|
||||
// the macros could impact on performance.
|
||||
// In general we don't want to touch this code going forward; the vast majority of amd64 CPUs have aesni
|
||||
// and use the Xmm Aes codepaths.
|
||||
|
||||
#include "symcryptasm_shared.cppasm"
|
||||
|
||||
#include "symcrypt_version.inc"
|
||||
|
||||
#define USE_BLOCK_FUNCTION 1 // Set to 1 to use block function, 0 to use block macro
|
||||
|
||||
#if defined(SYMCRYPT_MASM)
|
||||
extern SymCryptAesSboxMatrixMult:DWORD
|
||||
extern SymCryptAesInvSboxMatrixMult:DWORD
|
||||
extern SymCryptAesInvSbox:BYTE
|
||||
extern SymCryptFatal:NEAR
|
||||
|
||||
#elif defined(SYMCRYPT_GAS)
|
||||
|
||||
#else
|
||||
#error Unknown target assembly
|
||||
#endif
|
||||
|
||||
#if DBG
|
||||
SET(SYMCRYPT_CODE_VERSION, ((SYMCRYPT_CODE_VERSION_API SHL 16) OR SYMCRYPT_CODE_VERSION_MINOR ))
|
||||
SET(SYMCRYPT_MAGIC_CONSTANT, (HEX(53316D76) + SYMCRYPT_CODE_VERSION)) // 0x53316D76 == 'S1mv'
|
||||
|
||||
MACRO_START(SYMCRYPT_CHECK_MAGIC, check_magic_label, ptr, struct_magic_offset, arg_1)
|
||||
mov rax, [ptr + struct_magic_offset]
|
||||
sub rax, ptr
|
||||
cmp rax, SYMCRYPT_MAGIC_CONSTANT
|
||||
jz check_magic_label
|
||||
mov arg_1, HEX(6D616763) // 0x6D616763 == 'magc'
|
||||
call SymCryptFatal
|
||||
check_magic_label:
|
||||
MACRO_END()
|
||||
#else
|
||||
MACRO_START(SYMCRYPT_CHECK_MAGIC, check_magic_label, ptr, struct_magic_offset, arg_1)
|
||||
MACRO_END()
|
||||
#endif
|
||||
|
||||
//
|
||||
// Structure definition that mirrors the SYMCRYPT_AES_EXPANDED_KEY structure.
|
||||
//
|
||||
|
||||
// SYMCRYPT_AES_EXPANDED_KEY struct
|
||||
// RoundKey dq 2*N_ROUND_KEYS_IN_AESKEY dup (?) //
|
||||
// lastEncRoundKey dq ? // pointer to last enc round key
|
||||
// lastDecRoundKey dq ? // pointer to last dec round key
|
||||
// SYMCRYPT_MAGIC_FIELD
|
||||
// SYMCRYPT_AES_EXPANDED_KEY ends
|
||||
|
||||
SET(N_ROUND_KEYS_IN_AESKEY, 29)
|
||||
SET(lastEncRoundKeyOffset, (29*16))
|
||||
SET(lastDecRoundKeyOffset, (29*16 + 8))
|
||||
SET(magicFieldOffset, (29*16 + 8 + 8))
|
||||
|
||||
//
|
||||
// Shorthand for the 4 tables we will use
|
||||
// We always use r11 to point to the (inv) SboxMatrixMult tables
|
||||
//
|
||||
#define SMM0 (r11 + 0)
|
||||
#define SMM1 (r11 + 1024)
|
||||
#define SMM2 (r11 + 2048)
|
||||
#define SMM3 (r11 + 3072)
|
||||
|
||||
#define ISMM0 (r11 + 0)
|
||||
#define ISMM1 (r11 + 1024)
|
||||
#define ISMM2 (r11 + 2048)
|
||||
#define ISMM3 (r11 + 3072)
|
||||
|
||||
MACRO_START(ENC_MIX, keyptr)
        //
        // Perform the unkeyed mixing function for encryption
        // plus a key addition from the key pointer
        //
        // input: block is in eax, ebx, ecx, edx - r11 points to AesSboxMatrixMult
        // New state ends up in eax, ebx, ecx, edx
        // Used registers: esi, edi, ebp, r8
        //
        // We can use the e<xx> registers for the movzx as the
        // upper 32 bits are automatically set to 0. This saves
        // prefix bytes
        //
        // We use 32-bit registers to store the state.
        // We tried using 64-bit registers, but the extra shifts
        // cost too much.
        // Using 32-bit throughout makes the key xor more expensive
        // but we avoid having to combine the 32-bit halves into
        // 64 bit.
        //

        // Column 0 (eax): each byte indexes one of the 4 tables; the 4 results
        // are scattered over esi/r8d/ebp/edi, which implements ShiftRows.
        movzx   esi,al
        mov     esi,[SMM0 + 4 * rsi]
        movzx   edi,ah
        shr     eax,16
        mov     r8d,[SMM1 + 4 * rdi]
        movzx   ebp,al
        mov     ebp,[SMM2 + 4 * rbp]
        movzx   edi,ah
        mov     edi,[SMM3 + 4 * rdi]

        // Column 1 (ebx): xor table lookups into the accumulating new columns.
        movzx   eax,bl
        xor     edi,[SMM0 + 4 * rax]
        movzx   eax,bh
        shr     ebx,16
        xor     esi,[SMM1 + 4 * rax]
        movzx   eax,bl
        xor     r8d,[SMM2 + 4 * rax]
        movzx   eax,bh
        xor     ebp,[SMM3 + 4 * rax]

        // Column 2 (ecx)
        movzx   eax,cl
        xor     ebp,[SMM0 + 4 * rax]
        movzx   ebx,ch
        shr     ecx,16
        xor     edi,[SMM1 + 4 * rbx]
        movzx   eax,cl
        xor     esi,[SMM2 + 4 * rax]
        movzx   ebx,ch
        xor     r8d,[SMM3 + 4 * rbx]

        // Column 3 (edx)
        movzx   eax,dl
        xor     r8d,[SMM0 + 4 * rax]
        movzx   ebx,dh
        shr     edx,16
        xor     ebp,[SMM1 + 4 * rbx]
        movzx   eax,dl
        xor     edi,[SMM2 + 4 * rax]
        movzx   ebx,dh
        xor     esi,[SMM3 + 4 * rbx]

        // AddRoundKey: load the round key into eax..edx and xor in the mixed
        // state from esi/edi/ebp/r8d, leaving the new state back in eax..edx.
        mov     eax, [keyptr]
        mov     ebx, [keyptr + 4]
        xor     eax, esi
        mov     ecx, [keyptr + 8]
        xor     ebx, edi
        mov     edx, [keyptr + 12]
        xor     ecx, ebp
        xor     edx, r8d
MACRO_END()
|
||||
|
||||
|
||||
MACRO_START(DEC_MIX, keyptr)
        //
        // Perform the unkeyed mixing function for decryption
        // plus a key addition from the key pointer
        //
        // input: block is in eax, ebx, ecx, edx
        // r11 points to AesInvSboxMatrixMult
        // New state ends up in eax, ebx, ecx, edx
        // (the mixed-but-unkeyed state passes through esi, edi, ebp, r8d)
        //

        // Column 0 (eax); the scatter pattern is the mirror image of ENC_MIX
        // because InvShiftRows rotates in the opposite direction.
        movzx   esi,al
        mov     esi,[ISMM0 + 4 * rsi]
        movzx   edi,ah
        shr     eax,16
        mov     edi,[ISMM1 + 4 * rdi]
        movzx   ebp,al
        mov     ebp,[ISMM2 + 4 * rbp]
        movzx   eax,ah
        mov     r8d,[ISMM3 + 4 * rax]

        // Column 1 (ebx)
        movzx   eax,bl
        xor     edi,[ISMM0 + 4 * rax]
        movzx   eax,bh
        shr     ebx,16
        xor     ebp,[ISMM1 + 4 * rax]
        movzx   eax,bl
        xor     r8d,[ISMM2 + 4 * rax]
        movzx   eax,bh
        xor     esi,[ISMM3 + 4 * rax]

        // Column 2 (ecx)
        movzx   eax,cl
        xor     ebp,[ISMM0 + 4 * rax]
        movzx   ebx,ch
        shr     ecx,16
        xor     r8d,[ISMM1 + 4 * rbx]
        movzx   eax,cl
        xor     esi,[ISMM2 + 4 * rax]
        movzx   ebx,ch
        xor     edi,[ISMM3 + 4 * rbx]

        // Column 3 (edx)
        movzx   eax,dl
        xor     r8d,[ISMM0 + 4 * rax]
        movzx   ebx,dh
        shr     edx,16
        xor     esi,[ISMM1 + 4 * rbx]
        movzx   eax,dl
        xor     edi,[ISMM2 + 4 * rax]
        movzx   ebx,dh
        xor     ebp,[ISMM3 + 4 * rbx]

        // AddRoundKey: load round key into eax..edx, xor in the mixed state;
        // new state is in eax..edx, same convention as ENC_MIX.
        mov     eax, [keyptr]
        mov     ebx, [keyptr + 4]
        xor     eax, esi
        mov     ecx, [keyptr + 8]
        xor     ebx, edi
        mov     edx, [keyptr + 12]
        xor     ecx, ebp
        xor     edx, r8d
MACRO_END()
|
||||
|
||||
MACRO_START(AES_ENCRYPT_MACRO, AesEncryptMacroLoopLabel)
        //
        // Plaintext in eax, ebx, ecx, edx
        // r9 points to first round key to use (modified)
        // r10 is last key to use (unchanged)
        // r11 points to SboxMatrixMult (unchanged)
        // Ciphertext ends up in esi, edi, ebp, r8d
        //
        // This macro is free to unroll the cipher completely, or to use a loop
        // over r9
        //

        //
        // xor in first round key
        //
        xor     eax,[r9]
        xor     ebx,[r9+4]
        xor     ecx,[r9+8]
        xor     edx,[r9+12]

        // r9 skips ahead so that inside the loop "r9-16" is the round key to
        // apply (see loop body comment).
        add     r9,32

        // Do not unroll the loop at all because very few CPUs use this codepath so it's worth
        // minimizing the binary size

AesEncryptMacroLoopLabel:
        // Block is eax, ebx, ecx, edx
        // r9-16 points to next round key

        ENC_MIX r9-16

        // cmp sets CF; lea does not affect flags, so the jc below still tests
        // the cmp result while r9 advances to the next round key.
        cmp     r9,r10
        lea     r9,[r9+16]
        jc      AesEncryptMacroLoopLabel

        //
        // Now for the final round (SubBytes + ShiftRows, no MixColumns)
        // We use the fact that SboxMatrixMult[0] table is also
        // an Sbox table if you use the second element of each entry
        // (hence the "+ 1" in every lookup below).
        //
        // Result is in esi, edi, ebp, r8d
        //

        movzx   esi,al
        movzx   esi,byte ptr[r11 + 1 + 4*rsi]
        movzx   edi,ah
        shr     eax,16
        movzx   r8d,byte ptr[r11 + 1 + 4*rdi]
        movzx   ebp,al
        shl     r8d,8
        movzx   ebp,byte ptr[r11 + 1 + 4*rbp]
        shl     ebp,16
        movzx   edi,ah
        movzx   edi,byte ptr[r11 + 1 + 4*rdi]
        shl     edi,24

        movzx   eax,bl
        movzx   eax,byte ptr[r11 + 1 + 4*rax]
        or      edi,eax
        movzx   eax,bh
        shr     ebx,16
        movzx   eax,byte ptr[r11 + 1 + 4*rax]
        shl     eax,8
        or      esi,eax
        movzx   eax,bl
        movzx   eax,byte ptr[r11 + 1 + 4*rax]
        movzx   ebx,bh
        shl     eax,16
        movzx   ebx,byte ptr[r11 + 1 + 4*rbx]
        or      r8d,eax
        shl     ebx,24
        or      ebp,ebx

        movzx   eax,cl
        movzx   ebx,ch
        movzx   eax,byte ptr[r11 + 1 + 4*rax]
        shr     ecx,16
        movzx   ebx,byte ptr[r11 + 1 + 4*rbx]
        shl     ebx,8
        or      ebp,eax
        or      edi,ebx
        movzx   eax,cl
        movzx   eax,byte ptr[r11 + 1 + 4*rax]
        movzx   ebx,ch
        movzx   ebx,byte ptr[r11 + 1 + 4*rbx]
        shl     eax,16
        shl     ebx,24
        or      esi,eax
        or      r8d,ebx

        movzx   eax,dl
        movzx   ebx,dh
        movzx   eax,byte ptr[r11 + 1 + 4*rax]
        shr     edx,16
        movzx   ebx,byte ptr[r11 + 1 + 4*rbx]
        shl     ebx,8
        or      r8d,eax
        or      ebp,ebx
        movzx   eax,dl
        movzx   eax,byte ptr[r11 + 1 + 4*rax]
        movzx   ebx,dh
        movzx   ebx,byte ptr[r11 + 1 + 4*rbx]
        shl     eax,16
        shl     ebx,24
        or      edi,eax
        or      esi,ebx

        //
        // xor in final round key
        //

        xor     r8d,[r10+12]
        xor     esi,[r10]
        xor     edi,[r10+4]
        xor     ebp,[r10+8]
MACRO_END()
|
||||
|
||||
MACRO_START(AES_DECRYPT_MACRO, AesDecryptMacroLoopLabel)
        //
        // Ciphertext in eax, ebx, ecx, edx
        // r9 points to first round key to use (modified)
        // r10 is last key to use (unchanged)
        // r11 points to InvSboxMatrixMult (unchanged)
        // r12 points to InvSbox (unchanged)
        // Plaintext ends up in esi, edi, ebp, r8d
        //

        //
        // xor in first round key
        //
        xor     eax,[r9]
        xor     ebx,[r9+4]
        xor     ecx,[r9+8]
        xor     edx,[r9+12]

        // r9 skips ahead so that inside the loop "r9-16" is the round key to apply.
        add     r9,32

        // Do not unroll the loop at all because very few CPUs use this codepath so it's worth
        // minimizing the binary size
AesDecryptMacroLoopLabel:
        // Block is eax, ebx, ecx, edx
        // r9-16 points to next round key

        DEC_MIX r9-16

        // cmp sets CF; lea preserves flags while advancing r9.
        cmp     r9,r10
        lea     r9,[r9+16]
        jc      AesDecryptMacroLoopLabel

        //
        // Now for the final round (InvSubBytes via the plain InvSbox table in r12)
        // Result is in esi, edi, ebp, r8d
        //

        movzx   esi,al
        movzx   esi,byte ptr[r12 + rsi]
        movzx   edi,ah
        shr     eax,16
        movzx   edi,byte ptr[r12 + rdi]
        movzx   ebp,al
        shl     edi,8
        movzx   ebp,byte ptr[r12 + rbp]
        shl     ebp,16
        movzx   eax,ah
        movzx   r8d,byte ptr[r12 + rax]
        shl     r8d,24

        movzx   eax,bl
        movzx   eax,byte ptr[r12 + rax]
        or      edi,eax
        movzx   eax,bh
        shr     ebx,16
        movzx   eax,byte ptr[r12 + rax]
        shl     eax,8
        or      ebp,eax
        movzx   eax,bl
        movzx   eax,byte ptr[r12 + rax]
        movzx   ebx,bh
        shl     eax,16
        movzx   ebx,byte ptr[r12 + rbx]
        or      r8d,eax
        shl     ebx,24
        or      esi,ebx

        movzx   eax,cl
        movzx   ebx,ch
        movzx   eax,byte ptr[r12 + rax]
        shr     ecx,16
        movzx   ebx,byte ptr[r12 + rbx]
        shl     ebx,8
        or      ebp,eax
        or      r8d,ebx
        movzx   eax,cl
        movzx   eax,byte ptr[r12 + rax]
        movzx   ebx,ch
        movzx   ebx,byte ptr[r12 + rbx]
        shl     eax,16
        shl     ebx,24
        or      esi,eax
        or      edi,ebx

        movzx   eax,dl
        movzx   ebx,dh
        movzx   eax,byte ptr[r12 + rax]
        shr     edx,16
        movzx   ebx,byte ptr[r12 + rbx]
        shl     ebx,8
        or      r8d,eax
        or      esi,ebx
        movzx   eax,dl
        movzx   eax,byte ptr[r12 + rax]
        movzx   ebx,dh
        movzx   ebx,byte ptr[r12 + rbx]
        shl     eax,16
        shl     ebx,24
        or      edi,eax
        or      ebp,ebx

        //
        // xor in final round key
        //

        xor     esi,[r10]
        xor     edi,[r10+4]
        xor     ebp,[r10+8]
        xor     r8d,[r10+12]
MACRO_END()
|
||||
|
||||
#if USE_BLOCK_FUNCTION

//
// We use a block function, the AES_ENCRYPT macro merely calls the function.
// This trades a call/ret per block for much smaller code when the cipher is
// instantiated at several call sites.
//

MACRO_START(AES_ENCRYPT, loopLabel)
        call    SymCryptAesEncryptAsmInternal
MACRO_END()

MACRO_START(AES_DECRYPT, loopLabel)
        call    SymCryptAesDecryptAsmInternal
MACRO_END()

//========================================
// SymCryptAesEncryptAsmInternal
//
// Internal AES encryption routine with modified calling convention.
// This function has the exact same calling convention as the AES_ENCRYPT_MACRO
// (state in eax..edx, r9/r10 round-key pointers, r11 table pointer;
// result in esi, edi, ebp, r8d).

FUNCTION_START(SymCryptAesEncryptAsmInternal, 0, 0)

        AES_ENCRYPT_MACRO SymCryptAesEncryptAsmInternalLoop

FUNCTION_END(SymCryptAesEncryptAsmInternal)

//========================================
// SymCryptAesDecryptAsmInternal
//
// Internal AES decryption routine with modified calling convention.
// This function has the exact same calling convention as the AES_DECRYPT_MACRO
// (additionally requires r12 = InvSbox pointer).
//

FUNCTION_START(SymCryptAesDecryptAsmInternal, 0, 0)

        AES_DECRYPT_MACRO SymCryptAesDecryptAsmInternalLoop

FUNCTION_END(SymCryptAesDecryptAsmInternal)

#else

//
// No block function, use the macro directly (larger code, no call overhead)
//

MACRO_START(AES_ENCRYPT, loopLabel)
        AES_ENCRYPT_MACRO loopLabel
MACRO_END()

MACRO_START(AES_DECRYPT, loopLabel)
        AES_DECRYPT_MACRO loopLabel
MACRO_END()

#endif
|
||||
|
||||
//
//VOID
//SYMCRYPT_CALL
//SymCryptAesEncrypt( _In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
//      _In_reads_bytes_( SYMCRYPT_AES_BLOCK_LEN )      PCBYTE pbPlaintext,
//      _Out_writes_bytes_( SYMCRYPT_AES_BLOCK_LEN )    PBYTE pbCiphertext )
//
// Encrypts a single 16-byte block. Q1/Q2/Q3 are the symcryptasm virtual
// argument registers (mapped per calling convention by the preprocessor).

NESTED_FUNCTION_START(SymCryptAesEncryptAsm, 3, 15)

        SYMCRYPT_CHECK_MAGIC SymCryptAesEncryptAsmCheckMagic, Q1, magicFieldOffset, Q1

        // Here we convert from whatever calling convention we are called from externally to our
        // AES internal calling convention.
        // We need to be careful that we don't overwrite an argument register before we copy it to
        // the place it is needed internally in the AES functions.
        // There is no automatic method for checking we do this correctly - modify with care!
        // In SystemV and MSFT x64 ABIs, the possible 3 argument registers are:
        // rcx, rdx, r8, rdi, rsi

        mov     r10, [Q1 + lastEncRoundKeyOffset]       // r10 = last round key
        mov     r9, Q1                                  // r9 = first round key (start of RoundKey array)

        mov     [rsp + GET_MEMSLOT_OFFSET(slot0)], Q3   // save pbCiphertext; all GPRs are consumed below

        //
        // Load the plaintext
        //
        mov     eax,[Q2     ]
        mov     ebx,[Q2 +  4]
        mov     ecx,[Q2 +  8]
        mov     edx,[Q2 + 12]

        lea     r11,[GET_SYMBOL_ADDRESS(SymCryptAesSboxMatrixMult)]

        AES_ENCRYPT SymCryptAesEncryptAsmLoop
        // Plaintext in eax, ebx, ecx, edx
        // r9 points to first round key to use
        // r10 is last key to use (unchanged)
        // r11 points to SboxMatrixMult (unchanged)
        // Ciphertext ends up in esi, edi, ebp, r8d

        // retrieve pbCiphertext using Q0 because it is always rax regardless of calling convention
        mov     Q0,[rsp + GET_MEMSLOT_OFFSET(slot0)]
        mov     [Q0     ], esi
        mov     [Q0 +  4], edi
        mov     [Q0 +  8], ebp
        mov     [Q0 + 12], r8d

NESTED_FUNCTION_END(SymCryptAesEncryptAsm)
|
||||
|
||||
|
||||
//
//VOID
//SYMCRYPT_CALL
//SymCryptAesDecrypt( _In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
//      _In_reads_bytes_( SYMCRYPT_AES_BLOCK_LEN )      PCBYTE pbCiphertext,
//      _Out_writes_bytes_( SYMCRYPT_AES_BLOCK_LEN )    PBYTE pbPlaintext )
//
// Decrypts a single 16-byte block.

NESTED_FUNCTION_START(SymCryptAesDecryptAsm, 3, 15)

        SYMCRYPT_CHECK_MAGIC SymCryptAesDecryptAsmCheckMagic, Q1, magicFieldOffset, Q1

        // Here we convert from whatever calling convention we are called from externally to our
        // AES internal calling convention.
        // We need to be careful that we don't overwrite an argument register before we copy or use
        // the value appropriately for use in the AES functions.
        // There is no automatic method for checking we do this correctly - modify with care!
        // In SystemV and MSFT x64 ABIs, the possible 3 argument registers are:
        // rcx, rdx, r8, rdi, rsi

        // For decryption the FIRST key to apply is lastEncRoundKey and the LAST
        // is lastDecRoundKey (the decryption key schedule runs backwards).
        mov     r9,[Q1 + lastEncRoundKeyOffset]
        mov     r10,[Q1 + lastDecRoundKeyOffset]

        mov     [rsp + GET_MEMSLOT_OFFSET(slot0)], Q3   // save pbPlaintext

        // Load the ciphertext
        mov     eax,[Q2     ]
        mov     ebx,[Q2 +  4]
        mov     ecx,[Q2 +  8]
        mov     edx,[Q2 + 12]

        lea     r11,[GET_SYMBOL_ADDRESS(SymCryptAesInvSboxMatrixMult)]
        lea     r12,[GET_SYMBOL_ADDRESS(SymCryptAesInvSbox)]

        AES_DECRYPT SymCryptAesDecryptAsmLoop
        // Ciphertext in eax, ebx, ecx, edx
        // r9 points to first round key to use
        // r10 is last key to use (unchanged)
        // r11 points to InvSboxMatrixMult (unchanged)
        // r12 points to InvSbox (unchanged)
        // Plaintext ends up in esi, edi, ebp, r8d

        // retrieve pbPlaintext using Q0 because it is always rax regardless of calling convention
        mov     Q0,[rsp + GET_MEMSLOT_OFFSET(slot0)]
        mov     [Q0     ], esi
        mov     [Q0 +  4], edi
        mov     [Q0 +  8], ebp
        mov     [Q0 + 12], r8d

NESTED_FUNCTION_END(SymCryptAesDecryptAsm)
|
||||
|
||||
//VOID
//SYMCRYPT_CALL
//SymCryptAesCbcEncrypt(
//      _In_                                        PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
//      _In_reads_bytes_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE                       pbChainingValue,
//      _In_reads_bytes_( cbData )                  PCBYTE                      pbSrc,
//      _Out_writes_bytes_( cbData )                PBYTE                       pbDst,
//                                                  SIZE_T                      cbData )
//
// CBC-encrypts cbData bytes (rounded down to whole blocks) and updates the
// chaining value in place.

NESTED_FUNCTION_START(SymCryptAesCbcEncryptAsm, 5, 15)

        // Here we convert from whatever calling convention we are called from externally to our
        // AES internal calling convention.
        // We need to be careful that we don't overwrite an argument register before we copy or use
        // the value appropriately for use in the AES functions.
        // There is no automatic method for checking we do this correctly - modify with care!
        // In SystemV and MSFT x64 ABIs, the possible 5 argument registers are:
        // rcx, rdx, r8, r9, r10, rdi, rsi

        SYMCRYPT_CHECK_MAGIC SymCryptAesCbcEncryptAsmCheckMagic, Q1, magicFieldOffset, Q1

        and     Q5, NOT 15              // only deal with whole # blocks
        jz      SymCryptAesCbcEncryptNoData

        mov     [rsp + GET_MEMSLOT_OFFSET(slot0)], Q2   // save pbChainingValue
        mov     rax, Q2                 // rax = pbChainingValue
        mov     r13, Q3                 // r13 = pbSrc

        mov     r15, Q5                 // r15 = cbData
        mov     r14, Q4                 // r14 = pbDst

        add     r15, Q3                 // r15 = pbSrcEnd

        mov     r10,[Q1 + lastEncRoundKeyOffset]        // r10 = last enc round key
        mov     r12,Q1                  // r12 = first round key to use

        //
        // Load the chaining state from pbChainingValue
        //
        mov     esi,[rax     ]
        mov     edi,[rax +  4]
        mov     ebp,[rax +  8]
        mov     r8d,[rax + 12]

        lea     r11,[GET_SYMBOL_ADDRESS(SymCryptAesSboxMatrixMult)]

        ALIGN(16)
SymCryptAesCbcEncryptAsmLoop:
        // Loop register setup
        // r10 = last round key to use
        // r12 = first round key to use
        // r13 = pbSrc
        // r14 = pbDst
        // r15 = pbSrcEnd

        // chaining state in (esi,edi,ebp,r8d)

        // Load next plaintext block and xor in the chaining state (CBC input
        // whitening); loads and xors are interleaved to shorten dependency chains.
        mov     eax, [r13]
        mov     r9, r12                 // r9 = first round key (AES_ENCRYPT modifies r9)
        mov     ebx, [r13+4]
        xor     eax, esi
        mov     ecx, [r13+8]
        xor     ebx, edi
        xor     ecx, ebp
        mov     edx, [r13+12]
        xor     edx, r8d

        add     r13, 16

        AES_ENCRYPT SymCryptAesCbcEncryptAsmInnerLoop
        //
        // Plaintext in eax, ebx, ecx, edx
        // r9 points to first round key to use
        // r10 is last key to use (unchanged)
        // r11 points to SboxMatrixMult (unchanged)
        // Ciphertext ends up in esi, edi, ebp, r8d
        // (which is exactly the next block's chaining state)
        //

        mov     [r14], esi
        mov     [r14+4], edi
        mov     [r14+8], ebp
        mov     [r14+12], r8d

        add     r14, 16

        cmp     r13, r15

        jb      SymCryptAesCbcEncryptAsmLoop

        //
        // Update the chaining value
        //
        mov     Q0,[rsp + GET_MEMSLOT_OFFSET(slot0)]
        mov     [Q0], esi
        mov     [Q0+4], edi
        mov     [Q0+8], ebp
        mov     [Q0+12], r8d

SymCryptAesCbcEncryptNoData:

NESTED_FUNCTION_END(SymCryptAesCbcEncryptAsm)
|
||||
|
||||
|
||||
//VOID
//SYMCRYPT_CALL
//SymCryptAesCbcDecrypt(
//      _In_                                        PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
//      _In_reads_bytes_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE                       pbChainingValue,
//      _In_reads_bytes_( cbData )                  PCBYTE                      pbSrc,
//      _Out_writes_bytes_( cbData )                PBYTE                       pbDst,
//                                                  SIZE_T                      cbData )
//
// CBC-decrypts whole blocks. Processes the buffer BACKWARDS (last block first)
// so that the xor-with-previous-ciphertext can read pbSrc directly even when
// pbSrc == pbDst (in-place decryption).

NESTED_FUNCTION_START(SymCryptAesCbcDecryptAsm, 5, 15)

        // Here we convert from whatever calling convention we are called from externally to our
        // AES internal calling convention.
        // We need to be careful that we don't overwrite an argument register before we copy or use
        // the value appropriately for use in the AES functions.
        // There is no automatic method for checking we do this correctly - modify with care!
        // In SystemV and MSFT x64 ABIs, the possible 5 argument registers are:
        // rcx, rdx, r8, r9, r10, rdi, rsi

        SYMCRYPT_CHECK_MAGIC SymCryptAesCbcDecryptAsmCheckMagic, Q1, magicFieldOffset, Q1

        and     Q5, NOT 15              // only deal with whole # blocks
        jz      SymCryptAesCbcDecryptNoData

        mov     [rsp + GET_MEMSLOT_OFFSET(slot0)], Q2   // save pbChainingValue
        mov     [rsp + GET_MEMSLOT_OFFSET(slot1)], Q3   // save pbSrc

        lea     r14, [Q5 - 16]
        lea     r15, [Q4 + r14]         // r15 = pbDst pointed to last block
        add     r14, Q3                 // r14 = pbSrc pointed to last block

        // For decryption: first key = lastEncRoundKey, last key = lastDecRoundKey.
        mov     r13,[Q1 + lastEncRoundKeyOffset]
        mov     r10,[Q1 + lastDecRoundKeyOffset]

        lea     r11,[GET_SYMBOL_ADDRESS(SymCryptAesInvSboxMatrixMult)]
        lea     r12,[GET_SYMBOL_ADDRESS(SymCryptAesInvSbox)]

        //
        // Load last ciphertext block & save on stack (we need to put it in the pbChaining buffer later)
        //
        mov     eax,[r14]
        mov     ebx,[r14+4]
        mov     ecx,[r14+8]
        mov     edx,[r14+12]

        mov     [rsp + GET_MEMSLOT_OFFSET(slot2)  ], eax
        mov     [rsp + GET_MEMSLOT_OFFSET(slot2)+4], ebx
        mov     [rsp + GET_MEMSLOT_OFFSET(slot3)  ], ecx
        mov     [rsp + GET_MEMSLOT_OFFSET(slot3)+4], edx

        jmp     SymCryptAesCbcDecryptAsmLoopEntry

        ALIGN(16)

SymCryptAesCbcDecryptAsmLoop:
        // Loop register setup
        // r13 = first round key to use
        // r14 = pbSrc (current block)
        // r15 = pbDst (current block)
        // [slot1] = pbSrcStart

        // current decrypted block (pre-xor) in (esi,edi,ebp,r8d)

        // xor the previous ciphertext block into the decryption result, store
        // the recovered plaintext, and load the previous ciphertext as the next
        // block to decrypt - all interleaved for latency.
        mov     eax,[r14-16]
        mov     ebx,[r14-12]
        xor     esi,eax
        mov     ecx,[r14-8]
        xor     edi,ebx
        mov     [r15],esi
        mov     edx,[r14-4]
        xor     ebp,ecx
        mov     [r15+4],edi
        xor     r8d,edx
        mov     [r15+8],ebp
        mov     [r15+12],r8d

        sub     r14,16
        sub     r15,16

SymCryptAesCbcDecryptAsmLoopEntry:

        mov     r9, r13

        AES_DECRYPT SymCryptAesCbcDecryptAsmInnerLoop
        //
        // Ciphertext in eax, ebx, ecx, edx
        // r9 points to first round key to use
        // r10 is last key to use (unchanged)
        // r11 points to InvSboxMatrixMult (unchanged)
        // r12 points to InvSbox (unchanged)
        // Decryption result ends up in esi, edi, ebp, r8d
        //

        cmp     r14, [rsp + GET_MEMSLOT_OFFSET(slot1)]  // pbSrc
        ja      SymCryptAesCbcDecryptAsmLoop

        // First block: xor with the caller's chaining value instead of a
        // previous ciphertext block.
        mov     rbx,[rsp + GET_MEMSLOT_OFFSET(slot0)]   // pbChainingValue
        xor     esi,[rbx]
        xor     edi,[rbx+4]
        xor     ebp,[rbx+8]
        xor     r8d,[rbx+12]

        mov     [r15], esi
        mov     [r15+4], edi
        mov     [r15+8], ebp
        mov     [r15+12], r8d

        //
        // Update the chaining value to the last ciphertext block (saved earlier,
        // because pbDst may already have overwritten pbSrc in-place)
        //
        mov     rax,[rsp + GET_MEMSLOT_OFFSET(slot2)]
        mov     rcx,[rsp + GET_MEMSLOT_OFFSET(slot3)]
        mov     [rbx], rax
        mov     [rbx+8], rcx

SymCryptAesCbcDecryptNoData:

NESTED_FUNCTION_END(SymCryptAesCbcDecryptAsm)
|
||||
|
||||
//VOID
//SYMCRYPT_CALL
//SymCryptAesCtrMsb64(
//      _In_                                        PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
//      _In_reads_bytes_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE                       pbChainingValue,
//      _In_reads_bytes_( cbData )                  PCBYTE                      pbSrc,
//      _Out_writes_bytes_( cbData )                PBYTE                       pbDst,
//                                                  SIZE_T                      cbData )
//
// CTR mode with a 64-bit big-endian counter in the last 8 bytes of the
// chaining value. Processes whole blocks only.

NESTED_FUNCTION_START(SymCryptAesCtrMsb64Asm, 5, 15)

        // Here we convert from whatever calling convention we are called from externally to our
        // AES internal calling convention.
        // We need to be careful that we don't overwrite an argument register before we copy or use
        // the value appropriately for use in the AES functions.
        // There is no automatic method for checking we do this correctly - modify with care!
        // In SystemV and MSFT x64 ABIs, the possible 5 argument registers are:
        // rcx, rdx, r8, r9, r10, rdi, rsi

        SYMCRYPT_CHECK_MAGIC SymCryptAesCtrMsb64AsmCheckMagic, Q1, magicFieldOffset, Q1

        and     Q5, NOT 15              // only deal with whole # blocks
        jz      SymCryptAesCtrMsb64NoData

        mov     [rsp + GET_MEMSLOT_OFFSET(slot0)], Q2   // save pbChainingState
        mov     rax, Q2                 // rax = pbChainingValue
        mov     r13, Q3                 // r13 = pbSrc
        mov     r14, Q5                 // r14 = cbData
        mov     r15, Q4                 // r15 = pbDst
        add     r14, Q3                 // r14 = cbData + pbSrc = pbSrcEnd

        mov     r10,[Q1 + lastEncRoundKeyOffset]        // r10 = last enc round key
        mov     r12,Q1                  // r12 = first round key to use

        lea     r11,[GET_SYMBOL_ADDRESS(SymCryptAesSboxMatrixMult)]

        //
        // Load the chaining state
        //
        mov     rcx, [rax + 8]
        mov     rax, [rax    ]

        //
        // Store it in our local copy (we have no register free to keep pbChainingState in)
        //
        mov     [rsp + GET_MEMSLOT_OFFSET(slot1)], rax
        mov     [rsp + GET_MEMSLOT_OFFSET(slot2)], rcx

        //
        // Move to the right registers (split the two 64-bit halves into the
        // four 32-bit state registers eax/ebx/ecx/edx)
        //
        mov     rbx, rax
        mov     rdx, rcx
        shr     rbx, 32
        shr     rdx, 32

        ALIGN(16)
SymCryptAesCtrMsb64AsmLoop:
        // Loop invariant
        // Current chaining state is in (eax, ebx, ecx, edx)
        // r10 = last round key to use
        // r11 = SboxMatrixMult
        // r12 = first round key to use
        // r13 = pbSrc
        // r14 = pbSrcEnd
        // r15 = pbDst
        // [slot1..slot2] = 16 bytes chaining state block

        mov     r9, r12

        AES_ENCRYPT SymCryptAesCtrMsb64AsmInnerLoop
        //
        // Plaintext in eax, ebx, ecx, edx
        // r9 points to first round key to use
        // r10 is last key to use (unchanged)
        // r11 points to SboxMatrixMult (unchanged)
        // Key stream ends up in esi, edi, ebp, r8d
        //

        // To improve latency, we FIRST
        // load the chaining state, increment the counter, and write it back.
        // leave the state in the (eax, ebx, ecx, edx) registers
        // (bswap converts the big-endian counter to native order for the add).

        mov     eax,dword ptr [rsp + GET_MEMSLOT_OFFSET(slot1) + 0]
        mov     ebx,dword ptr [rsp + GET_MEMSLOT_OFFSET(slot1) + 4]
        mov     rcx,[rsp + GET_MEMSLOT_OFFSET(slot2) ]
        bswap   rcx
        add     rcx, 1
        bswap   rcx
        mov     [rsp + GET_MEMSLOT_OFFSET(slot2) ], rcx
        mov     rdx, rcx
        shr     rdx, 32

        // THEN we process the XOR of the key stream with the data
        // This order is faster as we need to have the chaining state done
        // before we can proceed, but there are no dependencies on the data result
        // So we can loop back to the beginning while the data stream read/writes are
        // still in flight.
        //
        // xor with the source stream

        xor     esi,[r13 + 0 ]
        xor     edi,[r13 + 4 ]
        xor     ebp,[r13 + 8 ]
        xor     r8d,[r13 + 12]

        // store at the destination

        mov     [r15 + 0], esi
        mov     [r15 + 4], edi
        mov     [r15 + 8], ebp
        mov     [r15 + 12], r8d

        add     r13, 16         // pbSrc += 16
        add     r15, 16         // pbDst += 16

        cmp     r13, r14

        jb      SymCryptAesCtrMsb64AsmLoop

        //
        // Copy back the chaining value - we only modified the last 8 bytes, so that is all we copy
        // (ecx/edx hold the low/high dwords of the updated counter)
        //
        mov     rsi,[rsp + GET_MEMSLOT_OFFSET(slot0)]   // pbChainingState
        mov     [rsi + 8], ecx
        mov     [rsi + 12], edx

        //
        // Wipe the chaining value on stack (avoid leaving key-stream-related
        // secrets in stack memory)
        //
        xor     rax, rax
        mov     [rsp + GET_MEMSLOT_OFFSET(slot1)], rax
        mov     [rsp + GET_MEMSLOT_OFFSET(slot2)], rax

SymCryptAesCtrMsb64NoData:

NESTED_FUNCTION_END(SymCryptAesCtrMsb64Asm)
|
||||
|
||||
FILE_END()
|
|
@ -1,529 +0,0 @@
|
|||
;
;  fdef_369asm.asm   Assembler code for large integer arithmetic in the default data format
;
; This file contains alternative routines that are used for modular computations
; where the modulus is 257-384 or 513-576 bits long.
; (Currently on ARM64 it is also used for 0-192-bit moduli but not on AMD64)
;
; The immediate advantage is that it improves EC performance on 384, and 521-bit curves.
;
; Most of this code is a direct copy of the default code.
; AMD64 digits are now 512 bits.
; We read the 'ndigit' value. If it is 1 digit, the values are 6 64-bit words, if it is 2 the values
; are 9 64-bit words. As we compute in groups of 3, our loop counters are one more than nDigit
;
; Copyright (c) Microsoft Corporation. Licensed under the MIT license.
;

include ksamd64.inc

include symcrypt_version.inc
include symcrypt_magic.inc

include C_asm_shared.inc

; A digit consists of 4 words of 64 bits each
|
||||
|
||||
; A digit consists of 4 words of 64 bits each
|
||||
|
||||
;UINT32
;SYMCRYPT_CALL
;SymCryptFdef369RawAddAsm(
;   _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE )   PCUINT32    Src1,
;   _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE )   PCUINT32    Src2,
;   _Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32     Dst,
;                                                           UINT32      nDigits );
;
; Dst = Src1 + Src2; returns the carry out (0 or 1).
; Processes (nDigits + 1) groups of 3 64-bit words (see file header).

        LEAF_ENTRY SymCryptFdef369RawAddAsm, _TEXT

        ; rcx = Src1
        ; rdx = Src2
        ; r8 = Dst
        ; r9 = nDigits

        add     r9, 1
        xor     rax, rax                ; also clears CF for the first adc
        xor     r10, r10

        ; Cy = 0

SymCryptFdef369RawAddAsmLoop:
        ; carry is in the carry flag
        ; Note: lea/dec are used for pointer/counter updates because neither
        ; modifies CF, so the carry chain survives across iterations.
        mov     rax,[rcx]
        adc     rax,[rdx]
        mov     [r8],rax

        mov     rax,[rcx + 8]
        adc     rax,[rdx + 8]
        mov     [r8 + 8], rax

        mov     rax,[rcx + 16]
        adc     rax,[rdx + 16]
        mov     [r8 + 16], rax

        lea     rcx, [rcx + 24]
        lea     rdx, [rdx + 24]
        lea     r8,  [r8 + 24]
        dec     r9d
        jnz     SymCryptFdef369RawAddAsmLoop

        ; materialize the final carry flag into rax (r10 == 0)
        mov     rax, r10
        adc     rax, r10

        ret

        LEAF_END SymCryptFdef369RawAddAsm, _TEXT
|
||||
|
||||
|
||||
;UINT32
;SYMCRYPT_CALL
;SymCryptFdef369RawSubAsm(
;   _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE )   PCUINT32    pSrc1,
;   _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE )   PCUINT32    pSrc2,
;   _Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32     pDst,
;                                                           UINT32      nDigits );
;
; Dst = Src1 - Src2; returns the borrow out (0 or 1).

        LEAF_ENTRY  SymCryptFdef369RawSubAsm, _TEXT

        ; rcx = Src1
        ; rdx = Src2
        ; r8 = Dst
        ; r9 = nDigits

        add     r9, 1
        xor     rax, rax                ; also clears CF (borrow) for the first sbb
        xor     r10, r10

SymCryptFdef369RawSubAsmLoop:
        ; borrow is in the carry flag; lea/dec preserve CF across iterations
        mov     rax,[rcx]
        sbb     rax,[rdx]
        mov     [r8],rax

        mov     rax,[rcx + 8]
        sbb     rax,[rdx + 8]
        mov     [r8 + 8], rax

        mov     rax,[rcx + 16]
        sbb     rax,[rdx + 16]
        mov     [r8 + 16], rax

        lea     rcx, [rcx + 24]
        lea     rdx, [rdx + 24]
        lea     r8,  [r8 + 24]
        dec     r9d
        jnz     SymCryptFdef369RawSubAsmLoop

        ; materialize the final borrow into rax (r10 == 0)
        mov     rax, r10
        adc     rax, r10

        ret

        LEAF_END SymCryptFdef369RawSubAsm, _TEXT
|
||||
|
||||
|
||||
|
||||
;VOID
;SYMCRYPT_CALL
;SymCryptFdef369MaskedCopyAsm(
;   _In_reads_bytes_( nDigits * SYMCRYPT_FDEF_DIGIT_SIZE )      PCBYTE      pbSrc,
;   _InOut_writes_bytes_( nDigits * SYMCRYPT_FDEF_DIGIT_SIZE )  PBYTE       pbDst,
;                                                               UINT32      nDigits,
;                                                               UINT32      mask )
;
; Side-channel-safe conditional copy: Dst = mask ? Src : Dst, computed as
; Dst ^= (Src ^ Dst) & mask without any data-dependent branches.
; mask must be 0 or 0xffffffff.

        LEAF_ENTRY SymCryptFdef369MaskedCopyAsm, _TEXT

        add     r8d, 1                  ; loop (nDigits + 1) times, 24 bytes each
        movsxd  r9, r9d                 ; sign-extend mask to 64 bits (0 -> 0, ffffffff -> all ones)

SymCryptFdef369MaskedCopyAsmLoop:
        mov     rax, [rcx]
        mov     r10, [rdx]
        xor     rax, r10
        and     rax, r9
        xor     rax, r10
        mov     [rdx], rax

        mov     rax, [rcx + 8]
        mov     r10, [rdx + 8]
        xor     rax, r10
        and     rax, r9
        xor     rax, r10
        mov     [rdx + 8], rax

        mov     rax, [rcx + 16]
        mov     r10, [rdx + 16]
        xor     rax, r10
        and     rax, r9
        xor     rax, r10
        mov     [rdx + 16], rax

        ; Move on to the next digit

        add     rcx, 24
        add     rdx, 24
        sub     r8d, 1
        jnz     SymCryptFdef369MaskedCopyAsmLoop
        ret

        LEAF_END SymCryptFdef369MaskedCopyAsm, _TEXT
|
||||
|
||||
;VOID
;SYMCRYPT_CALL
;SymCryptFdef369RawMulAsm(
;   _In_reads_(nWords1)             PCUINT32    pSrc1,
;                                   UINT32      nDigits1,
;   _In_reads_(nWords2)             PCUINT32    pSrc2,
;                                   UINT32      nDigits2,
;   _Out_writes_(nWords1 + nWords2) PUINT32     pDst )
;
; Schoolbook multiplication: Dst = Src1 * Src2.

; Stack frame layout after the 5 pushes below; only the .pDst offset
; (the 5th argument, spilled by the caller) is actually referenced.
; NOTE(review): the Saved* field names do not match the push order, but the
; layout size is what matters for the .pDst offset - verify if fields are added.
SymCryptFdef369RawMulAsm_Frame struct
        SavedRbx        dq  ?
        SavedRdi        dq  ?
        SavedRsi        dq  ?
        SavedR13        dq  ?
        SavedR12        dq  ?
        returnaddress   dq  ?
        Arg1Home        dq  ?
        Arg2Home        dq  ?
        Arg3Home        dq  ?
        Arg4Home        dq  ?
        pDst            dq  ?
SymCryptFdef369RawMulAsm_Frame ends

        NESTED_ENTRY SymCryptFdef369RawMulAsm, _TEXT

        rex_push_reg    rbx
        push_reg        r12
        push_reg        r13
        push_reg        rsi
        push_reg        rdi

        END_PROLOGUE

        ; Basic structure:
        ;   for each word in Src1:
        ;       Dst += Src2 * word
        ; Register assignments
        ;
        ; rax = tmp for mul
        ; rbx = word from Src1 to multiply with
        ; rcx = pSrc1 (updated in outer loop)
        ; rdx = tmp for mul
        ; rsi = inner loop pointer into pSrc2
        ; rdi = inner loop pointer into pDst
        ; r8 = pSrc2
        ; r9 = nDigits2
        ; r10 = pDst (incremented in outer loop)
        ; r11 = # words left from Src1 to process
        ; r12 = carry
        ; r13 = inner loop counter

        add     edx, 1                          ; nDigits1 + 1 inner-group count
        add     r9d, 1                          ; nDigits2 + 1
        lea     r11d, [edx + 2*edx]             ; nDigits1 * 3 = # words in Src1 to process
        mov     r10, [rsp + SymCryptFdef369RawMulAsm_Frame.pDst ]

        ; Outer loop invariant established: rcx, r8, r9, r10

        mov     rsi, r8                         ; rsi = pSrc2
        mov     rdi, r10                        ; rdi = pDst + outer loop ctr
        mov     rbx, [rcx]                      ; mulword
        xor     r12, r12
        mov     r13d, r9d

        ; First inner loop overwrites Dst, which avoids adding the current Dst value

SymCryptFdef369RawMulAsmLoop1:
        ; each group: rdx:rax = [rsi+k] * mulword, add carry, store low, keep high
        mov     rax, [rsi]
        mul     rbx
        add     rax, r12
        adc     rdx, 0
        mov     [rdi], rax
        mov     r12, rdx

        mov     rax, [rsi + 8]
        mul     rbx
        add     rax, r12
        adc     rdx, 0
        mov     [rdi + 8], rax
        mov     r12, rdx

        mov     rax, [rsi + 16]
        mul     rbx
        add     rax, r12
        adc     rdx, 0
        mov     [rdi + 16], rax
        mov     r12, rdx

        add     rsi, 24
        add     rdi, 24
        sub     r13d,1
        jnz     SymCryptFdef369RawMulAsmLoop1

        mov     [rdi], rdx                      ; write last word, cannot overflow because Dst is at least 2 digits long

        sub     r11d, 1

SymCryptFdef369RawMulAsmLoopOuter:

        add     rcx, 8                          ; move to next word of pSrc1
        add     r10, 8                          ; move Dst pointer one word over
        mov     rbx, [rcx]
        mov     rsi, r8
        mov     rdi, r10
        xor     r12, r12
        mov     r13d, r9d

SymCryptFdef369RawMulAsmLoop2:
        ; same as Loop1 but accumulates into the existing Dst value
        mov     rax, [rsi]
        mul     rbx
        add     rax, [rdi]
        adc     rdx, 0
        add     rax, r12
        adc     rdx, 0
        mov     [rdi], rax
        mov     r12, rdx

        mov     rax, [rsi + 8]
        mul     rbx
        add     rax, [rdi + 8]
        adc     rdx, 0
        add     rax, r12
        adc     rdx, 0
        mov     [rdi + 8], rax
        mov     r12, rdx

        mov     rax, [rsi + 16]
        mul     rbx
        add     rax, [rdi + 16]
        adc     rdx, 0
        add     rax, r12
        adc     rdx, 0
        mov     [rdi + 16], rax
        mov     r12, rdx

        add     rsi, 24
        add     rdi, 24
        sub     r13d,1
        jnz     SymCryptFdef369RawMulAsmLoop2

        mov     [rdi], rdx                      ; write next word. (stays within Dst buffer)

        sub     r11d, 1
        jnz     SymCryptFdef369RawMulAsmLoopOuter

        BEGIN_EPILOGUE

        pop     rdi
        pop     rsi
        pop     r13
        pop     r12
        pop     rbx
        ret

        NESTED_END SymCryptFdef369RawMulAsm, _TEXT
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
;VOID
|
||||
;SymCryptFdefMontgomeryReduceAsm(
|
||||
; _In_ PCSYMCRYPT_MODULUS pmMod,
|
||||
; _In_ PUINT32 pSrc,
|
||||
; _Out_ PUINT32 pDst )
|
||||
|
||||
NESTED_ENTRY SymCryptFdef369MontgomeryReduceAsm, _TEXT
|
||||
|
||||
rex_push_reg rbx
|
||||
push_reg r12
|
||||
push_reg r13
|
||||
push_reg r14
|
||||
push_reg rsi
|
||||
push_reg rdi
|
||||
push_reg rbp
|
||||
|
||||
END_PROLOGUE
|
||||
|
||||
mov r11, rdx ; r11 = pSrc
|
||||
mov ebp, [rcx + SymCryptModulusNdigitsOffsetAmd64] ; nDigits
|
||||
add ebp, 1
|
||||
mov r13, [rcx + SymCryptModulusMontgomeryInv64OffsetAmd64] ; inv64
|
||||
|
||||
lea rcx, [rcx + SymCryptModulusValueOffsetAmd64] ; modulus value
|
||||
|
||||
lea edi, [ebp + 2*ebp] ; outer loop counter, in words
|
||||
|
||||
xor r14d, r14d
|
||||
|
||||
; General register allocations
|
||||
; rax = multiply result
|
||||
; rbx = multiplier in inner loop
|
||||
; rcx = pointer to modulus value
|
||||
; rdx = multiply result
|
||||
; rsi = loop counter
|
||||
; rdi = loop counter
|
||||
; rbp = nDigits
|
||||
; r8 = pDst
|
||||
; r9 = running pointer in Src
|
||||
; r10 = running pointer in Mod
|
||||
; r11 = pSrc (updated in outer loop)
|
||||
; r12 = carry
|
||||
; r13 = pmMod->tm.montgomery.inv64
|
||||
; r14 = carry out from last word of previous loop iteration
|
||||
|
||||
|
||||
SymCryptFdef369MontgomeryReduceAsmOuterLoop:
|
||||
|
||||
; start decoder with a few simple instructions, including at least one that requires
|
||||
; a uop execution and is on the critical path
|
||||
|
||||
mov rbx, [r11] ; fetch word of Src we want to set to zero
|
||||
mov r10, r11
|
||||
mov r9, rcx
|
||||
|
||||
imul rbx, r13 ; lower word is same for signed & unsigned multiply
|
||||
|
||||
mov esi, ebp
|
||||
xor r12d, r12d
|
||||
|
||||
SymCryptFdef369MontgomeryReduceAsmInnerloop:
|
||||
; rax = mul scratch
|
||||
; rbx = multiplier
|
||||
; rcx = pointer to modulus value
|
||||
; rdx = mul scratch
|
||||
; edi = outer loop counter (words)
|
||||
; esi = inner loop counter (digits)
|
||||
; r9 = running ptr to modulus
|
||||
; r10 = running ptr to input/scratch
|
||||
; r12 = carry (64 bits)
|
||||
|
||||
mov rax, [r9]
|
||||
mul rbx
|
||||
add rax, [r10]
|
||||
adc rdx, 0
|
||||
add rax, r12
|
||||
adc rdx, 0
|
||||
mov [r10], rax
|
||||
mov r12, rdx
|
||||
|
||||
mov rax, [r9 + 8]
|
||||
mul rbx
|
||||
add rax, [r10 + 8]
|
||||
adc rdx, 0
|
||||
add rax, r12
|
||||
adc rdx, 0
|
||||
mov [r10 + 8], rax
|
||||
mov r12, rdx
|
||||
|
||||
mov rax, [r9 + 16]
|
||||
mul rbx
|
||||
add rax, [r10 + 16]
|
||||
adc rdx, 0
|
||||
add rax, r12
|
||||
adc rdx, 0
|
||||
mov [r10 + 16], rax
|
||||
mov r12, rdx
|
||||
|
||||
add r9, 24
|
||||
add r10, 24
|
||||
sub esi,1
|
||||
jnz SymCryptFdef369MontgomeryReduceAsmInnerloop
|
||||
|
||||
add r12, r14
|
||||
mov r14d, 0
|
||||
adc r14, 0
|
||||
add r12, [r10]
|
||||
adc r14, 0
|
||||
mov [r10], r12
|
||||
|
||||
add r11, 8
|
||||
|
||||
sub edi, 1
|
||||
jnz SymCryptFdef369MontgomeryReduceAsmOuterLoop
|
||||
|
||||
;
|
||||
; Most of the work is done; now all that is left is subtract the modulus if it is smaller than the result
|
||||
;
|
||||
|
||||
; First we compute the pSrc result minus the modulus into the destination
|
||||
mov esi, ebp ; loop ctr
|
||||
mov r10, r11 ; pSrc
|
||||
mov r9, rcx ; pMod
|
||||
mov r12, r8 ; pDst
|
||||
|
||||
; Cy = 0 because the last 'sub edi,1' resulted in 0
|
||||
|
||||
SymCryptFdef369MontgomeryReduceAsmSubLoop:
|
||||
mov rax,[r10]
|
||||
sbb rax,[r9]
|
||||
mov [r12], rax
|
||||
|
||||
mov rax,[r10 + 8]
|
||||
sbb rax,[r9 + 8]
|
||||
mov [r12 + 8], rax
|
||||
|
||||
mov rax,[r10 + 16]
|
||||
sbb rax,[r9 + 16]
|
||||
mov [r12 + 16], rax
|
||||
|
||||
lea r10,[r10+24]
|
||||
lea r9, [r9 +24]
|
||||
lea r12,[r12+24]
|
||||
dec esi
|
||||
jnz SymCryptFdef369MontgomeryReduceAsmSubLoop
|
||||
|
||||
; Finally a masked copy form pSrc to pDst
|
||||
; copy if: r14 == 0 && Cy = 1
|
||||
sbb r14, 0 ; mask (64 bits)
|
||||
|
||||
|
||||
SymCryptFdef369MontgomeryReduceAsmMaskedCopyLoop:
|
||||
mov rax, [r11]
|
||||
mov rsi, [r8]
|
||||
xor rax, rsi
|
||||
and rax, r14
|
||||
xor rax, rsi
|
||||
mov [r8], rax
|
||||
|
||||
mov rax, [r11 + 8]
|
||||
mov rsi, [r8 + 8]
|
||||
xor rax, rsi
|
||||
and rax, r14
|
||||
xor rax, rsi
|
||||
mov [r8 + 8], rax
|
||||
|
||||
mov rax, [r11 + 16]
|
||||
mov rsi, [r8 + 16]
|
||||
xor rax, rsi
|
||||
and rax, r14
|
||||
xor rax, rsi
|
||||
mov [r8 + 16], rax
|
||||
|
||||
; Move on to the next digit
|
||||
|
||||
add r11, 24
|
||||
add r8, 24
|
||||
sub ebp, 1
|
||||
jnz SymCryptFdef369MontgomeryReduceAsmMaskedCopyLoop
|
||||
|
||||
BEGIN_EPILOGUE
|
||||
|
||||
pop rbp
|
||||
pop rdi
|
||||
pop rsi
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rbx
|
||||
ret
|
||||
|
||||
NESTED_END SymCryptFdef369MontgomeryReduceAsm, _TEXT
|
||||
|
||||
end
|
||||
|
|
@ -0,0 +1,451 @@
|
|||
//
|
||||
// fdef_369asm.asm Assembler code for large integer arithmetic in the default data format
|
||||
// Expresses asm in a generic enough way to enable generation of MASM and GAS using the
|
||||
// symcryptasm_processor.py script and C preprocessor
|
||||
//
|
||||
// This file contains alternative routines that are used for modular computations
|
||||
// where the modulus is 257-384 or 513-576 bits long.
|
||||
// (Currently on ARM64 it is also used for 0-192-bit moduli but not on AMD64)
|
||||
//
|
||||
// The immediate advantage is that it improves EC performance on 384, and 521-bit curves.
|
||||
//
|
||||
// Most of this code is a direct copy of the default code.
|
||||
// AMD64 digits are now 512 bits.
|
||||
// We read the 'ndigit' value. If it is 1 digit, the values are 6 64-bit words, if it is 2 the values
|
||||
// are 9 64-bit words. As we compute in groups of 3, our loop counters are one more than nDigit
|
||||
//
|
||||
// Copyright (c) Microsoft Corporation. Licensed under the MIT license.
|
||||
|
||||
#include "symcryptasm_shared.cppasm"
|
||||
|
||||
// A digit consists of 4 words of 64 bits each
|
||||
|
||||
//UINT32
|
||||
//SYMCRYPT_CALL
|
||||
// SymCryptFdef369RawAddAsm(
|
||||
// _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 Src1,
|
||||
// _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 Src2,
|
||||
// _Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 Dst,
|
||||
// UINT32 nDigits )
|
||||
FUNCTION_START(SymCryptFdef369RawAddAsm, 4, 5)
|
||||
|
||||
inc D4
|
||||
xor Q0, Q0
|
||||
|
||||
SymCryptFdef369RawAddAsmLoop:
|
||||
// carry is in the carry flag
|
||||
mov Q0,[Q1]
|
||||
adc Q0,[Q2]
|
||||
mov [Q3],Q0
|
||||
|
||||
mov Q0,[Q1 + 8]
|
||||
adc Q0,[Q2 + 8]
|
||||
mov [Q3 + 8], Q0
|
||||
|
||||
mov Q0,[Q1 + 16]
|
||||
adc Q0,[Q2 + 16]
|
||||
mov [Q3 + 16], Q0
|
||||
|
||||
lea Q1, [Q1 + 24]
|
||||
lea Q2, [Q2 + 24]
|
||||
lea Q3, [Q3 + 24]
|
||||
dec D4
|
||||
jnz SymCryptFdef369RawAddAsmLoop
|
||||
|
||||
mov Q0, 0
|
||||
adc Q0, Q0
|
||||
|
||||
FUNCTION_END(SymCryptFdef369RawAddAsm)
|
||||
|
||||
// UINT32
|
||||
// SYMCRYPT_CALL
|
||||
// SymCryptFdef369RawSubAsm(
|
||||
// _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc1,
|
||||
// _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc2,
|
||||
// _Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 pDst,
|
||||
// UINT32 nDigits )
|
||||
|
||||
FUNCTION_START(SymCryptFdef369RawSubAsm, 4, 5)
|
||||
|
||||
inc D4
|
||||
xor Q0, Q0
|
||||
|
||||
SymCryptFdef369RawSubAsmLoop:
|
||||
// carry is in the carry flag
|
||||
mov Q0,[Q1]
|
||||
sbb Q0,[Q2]
|
||||
mov [Q3],Q0
|
||||
|
||||
mov Q0,[Q1 + 8]
|
||||
sbb Q0,[Q2 + 8]
|
||||
mov [Q3 + 8], Q0
|
||||
|
||||
mov Q0,[Q1 + 16]
|
||||
sbb Q0,[Q2 + 16]
|
||||
mov [Q3 + 16], Q0
|
||||
|
||||
lea Q1, [Q1 + 24]
|
||||
lea Q2, [Q2 + 24]
|
||||
lea Q3, [Q3 + 24]
|
||||
dec D4
|
||||
jnz SymCryptFdef369RawSubAsmLoop
|
||||
|
||||
mov Q0, 0
|
||||
adc Q0, Q0
|
||||
|
||||
FUNCTION_END(SymCryptFdef369RawSubAsm)
|
||||
|
||||
// VOID
|
||||
// SYMCRYPT_CALL
|
||||
// SymCryptFdef369MaskedCopyAsm(
|
||||
// _In_reads_bytes_( nDigits*SYMCRYPT_FDEF_DIGIT_SIZE ) PCBYTE pbSrc,
|
||||
// _Inout_updates_bytes_( nDigits*SYMCRYPT_FDEF_DIGIT_SIZE ) PBYTE pbDst,
|
||||
// UINT32 nDigits,
|
||||
// UINT32 mask )
|
||||
|
||||
FUNCTION_START(SymCryptFdef369MaskedCopyAsm, 4, 6)
|
||||
|
||||
inc D3
|
||||
movsxd Q4, D4
|
||||
|
||||
SymCryptFdef369MaskedCopyAsmLoop:
|
||||
mov Q0, [Q1]
|
||||
mov Q5, [Q2]
|
||||
xor Q0, Q5
|
||||
and Q0, Q4
|
||||
xor Q0, Q5
|
||||
mov [Q2], Q0
|
||||
|
||||
mov Q0, [Q1 + 8]
|
||||
mov Q5, [Q2 + 8]
|
||||
xor Q0, Q5
|
||||
and Q0, Q4
|
||||
xor Q0, Q5
|
||||
mov [Q2 + 8], Q0
|
||||
|
||||
mov Q0, [Q1 + 16]
|
||||
mov Q5, [Q2 + 16]
|
||||
xor Q0, Q5
|
||||
and Q0, Q4
|
||||
xor Q0, Q5
|
||||
mov [Q2 + 16], Q0
|
||||
|
||||
// Move on to the next digit
|
||||
|
||||
add Q1, 24
|
||||
add Q2, 24
|
||||
dec D3
|
||||
jnz SymCryptFdef369MaskedCopyAsmLoop
|
||||
|
||||
FUNCTION_END(SymCryptFdef369MaskedCopyAsm)
|
||||
|
||||
// VOID
|
||||
// SYMCRYPT_CALL
|
||||
// SymCryptFdef369RawMulAsm(
|
||||
// _In_reads_(nDigits1*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc1,
|
||||
// UINT32 nDigits1,
|
||||
// _In_reads_(nDigits2*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc2,
|
||||
// UINT32 nDigits2,
|
||||
// _Out_writes_((nDigits1+nDigits2)*SYMCRYPT_FDEF_DIGIT_NUINT32) PUINT32 pDst )
|
||||
|
||||
MUL_FUNCTION_START(SymCryptFdef369RawMulAsm, 5, 11)
|
||||
|
||||
// Basic structure:
|
||||
// for each word in Src1:
|
||||
// Dst += Src2 * word
|
||||
// Register assignments
|
||||
//
|
||||
// Q0 = tmp for mul
|
||||
// QH = tmp for mul
|
||||
// Q1 = pSrc1 (updated in outer loop)
|
||||
// D2 = # words left from Src1 to process
|
||||
// Q3 = pSrc2
|
||||
// Q4 = nDigits2
|
||||
// Q5 = pDst (incremented in outer loop)
|
||||
// Q6 = inner loop pointer into pSrc2
|
||||
// Q7 = inner loop pointer into pDst
|
||||
// Q8 = word from Src1 to multiply with
|
||||
// Q9 = carry
|
||||
// D10 = inner loop counter
|
||||
|
||||
inc D2
|
||||
inc D4
|
||||
lea D2, [D2 + 2*D2] // nDigits1 * 3 = # words in Src1 to process
|
||||
|
||||
// Outer loop invariant established: Q1, Q3, D4, Q5
|
||||
|
||||
mov Q6, Q3 // Q6 = pSrc2
|
||||
mov Q7, Q5 // Q7 = pDst + outer loop ctr
|
||||
mov Q8, [Q1] // mulword
|
||||
xor Q9, Q9
|
||||
mov D10, D4
|
||||
|
||||
// First inner loop overwrites Dst, which avoids adding the current Dst value
|
||||
|
||||
ALIGN(16)
|
||||
|
||||
SymCryptFdef369RawMulAsmLoop1:
|
||||
mov Q0, [Q6]
|
||||
mul Q8
|
||||
add Q0, Q9
|
||||
adc QH, 0
|
||||
mov [Q7], Q0
|
||||
mov Q9, QH
|
||||
|
||||
mov Q0, [Q6 + 8]
|
||||
mul Q8
|
||||
add Q0, Q9
|
||||
adc QH, 0
|
||||
mov [Q7 + 8], Q0
|
||||
mov Q9, QH
|
||||
|
||||
mov Q0, [Q6 + 16]
|
||||
mul Q8
|
||||
add Q0, Q9
|
||||
adc QH, 0
|
||||
mov [Q7 + 16], Q0
|
||||
mov Q9, QH
|
||||
|
||||
add Q6, 24
|
||||
add Q7, 24
|
||||
dec D10
|
||||
jnz SymCryptFdef369RawMulAsmLoop1
|
||||
|
||||
mov [Q7], QH // write last word, cannot overflow because Dst is at least 2 digits long
|
||||
|
||||
dec D2
|
||||
|
||||
ALIGN(16)
|
||||
|
||||
SymCryptFdef369RawMulAsmLoopOuter:
|
||||
|
||||
add Q1, 8 // move to next word of pSrc1
|
||||
add Q5, 8 // move Dst pointer one word over
|
||||
mov Q8, [Q1]
|
||||
mov Q6, Q3
|
||||
mov Q7, Q5
|
||||
xor Q9, Q9
|
||||
mov D10, D4
|
||||
|
||||
ALIGN(16)
|
||||
|
||||
SymCryptFdef369RawMulAsmLoop2:
|
||||
mov Q0, [Q6]
|
||||
mul Q8
|
||||
add Q0, [Q7]
|
||||
adc QH, 0
|
||||
add Q0, Q9
|
||||
adc QH, 0
|
||||
mov [Q7], Q0
|
||||
mov Q9, QH
|
||||
|
||||
mov Q0, [Q6 + 8]
|
||||
mul Q8
|
||||
add Q0, [Q7 + 8]
|
||||
adc QH, 0
|
||||
add Q0, Q9
|
||||
adc QH, 0
|
||||
mov [Q7 + 8], Q0
|
||||
mov Q9, QH
|
||||
|
||||
mov Q0, [Q6 + 16]
|
||||
mul Q8
|
||||
add Q0, [Q7 + 16]
|
||||
adc QH, 0
|
||||
add Q0, Q9
|
||||
adc QH, 0
|
||||
mov [Q7 + 16], Q0
|
||||
mov Q9, QH
|
||||
|
||||
add Q6, 24
|
||||
add Q7, 24
|
||||
dec D10
|
||||
jnz SymCryptFdef369RawMulAsmLoop2
|
||||
|
||||
mov [Q7], QH // write next word. (stays within Dst buffer)
|
||||
|
||||
dec D2
|
||||
jnz SymCryptFdef369RawMulAsmLoopOuter
|
||||
|
||||
MUL_FUNCTION_END(SymCryptFdef369RawMulAsm)
|
||||
|
||||
// VOID
|
||||
// SYMCRYPT_CALL
|
||||
// SymCryptFdef369MontgomeryReduceAsm(
|
||||
// _In_ PCSYMCRYPT_MODULUS pmMod,
|
||||
// _Inout_ PUINT32 pSrc,
|
||||
// _Out_ PUINT32 pDst )
|
||||
|
||||
MUL_FUNCTION_START(SymCryptFdef369MontgomeryReduceAsm, 3, 13)
|
||||
|
||||
mov D4, [Q1 + SymCryptModulusNdigitsOffsetAmd64] // nDigits
|
||||
inc D4
|
||||
mov Q5, [Q1 + SymCryptModulusMontgomeryInv64OffsetAmd64] // inv64
|
||||
|
||||
lea Q1, [Q1 + SymCryptModulusValueOffsetAmd64] // modulus value
|
||||
|
||||
lea D12, [D4 + 2*D4] // outer loop counter, in words
|
||||
|
||||
xor D8, D8
|
||||
|
||||
// General register allocations
|
||||
// Q0 = multiply result
|
||||
// QH = multiply result
|
||||
// Q1 = pointer to modulus value
|
||||
// Q2 = pSrc (updated in outer loop)
|
||||
// Q3 = pDst
|
||||
// D4 = nDigits
|
||||
// Q5 = pmMod->tm.montgomery.inv64
|
||||
// Q6 = multiplier in inner loop
|
||||
// Q7 = carry
|
||||
// Q8 = carry out from last word of previous loop iteration
|
||||
// Q9 = running pointer in Src
|
||||
// Q10 = running pointer in Mod
|
||||
// D11 = loop counter
|
||||
// D12 = outer loop counter (words)
|
||||
|
||||
ALIGN(16)
|
||||
|
||||
SymCryptFdef369MontgomeryReduceAsmOuterLoop:
|
||||
|
||||
// start decoder with a few simple instructions, including at least one that requires
|
||||
// a uop execution and is on the critical path
|
||||
|
||||
mov Q6, [Q2] // fetch word of Src we want to set to zero
|
||||
mov Q10, Q2
|
||||
mov Q9, Q1
|
||||
|
||||
imul Q6, Q5 // lower word is same for signed & unsigned multiply
|
||||
|
||||
mov D11, D4
|
||||
xor D7, D7
|
||||
|
||||
ALIGN(16)
|
||||
|
||||
SymCryptFdef369MontgomeryReduceAsmInnerloop:
|
||||
// Q0 = mul scratch
|
||||
// QH = mul scratch
|
||||
// Q1 = pointer to modulus value
|
||||
// Q6 = multiplier
|
||||
// Q7 = carry (64 bits)
|
||||
// Q9 = running ptr to modulus
|
||||
// Q10 = running ptr to input/scratch
|
||||
// D11 = inner loop counter (digits)
|
||||
// D12 = outer loop counter (words)
|
||||
|
||||
mov Q0, [Q9]
|
||||
mul Q6
|
||||
add Q0, [Q10]
|
||||
adc QH, 0
|
||||
add Q0, Q7
|
||||
adc QH, 0
|
||||
mov [Q10], Q0
|
||||
mov Q7, QH
|
||||
|
||||
mov Q0, [Q9 + 8]
|
||||
mul Q6
|
||||
add Q0, [Q10 + 8]
|
||||
adc QH, 0
|
||||
add Q0, Q7
|
||||
adc QH, 0
|
||||
mov [Q10 + 8], Q0
|
||||
mov Q7, QH
|
||||
|
||||
mov Q0, [Q9 + 16]
|
||||
mul Q6
|
||||
add Q0, [Q10 + 16]
|
||||
adc QH, 0
|
||||
add Q0, Q7
|
||||
adc QH, 0
|
||||
mov [Q10 + 16], Q0
|
||||
mov Q7, QH
|
||||
|
||||
add Q9, 24
|
||||
add Q10, 24
|
||||
dec D11
|
||||
jnz SymCryptFdef369MontgomeryReduceAsmInnerloop
|
||||
|
||||
add Q7, Q8
|
||||
mov D8, 0
|
||||
adc Q8, 0
|
||||
add Q7, [Q10]
|
||||
adc Q8, 0
|
||||
mov [Q10], Q7
|
||||
|
||||
add Q2, 8
|
||||
|
||||
dec D12
|
||||
jnz SymCryptFdef369MontgomeryReduceAsmOuterLoop
|
||||
|
||||
//
|
||||
// Most of the work is done - now all that is left is subtract the modulus if it is smaller than the result
|
||||
//
|
||||
|
||||
// First we compute the pSrc result minus the modulus into the destination
|
||||
mov D11, D4 // loop ctr
|
||||
mov Q10, Q2 // pSrc
|
||||
mov Q9, Q1 // pMod
|
||||
mov Q7, Q3 // pDst
|
||||
|
||||
// Cy = 0 because the last 'adc Q8,0' resulted in 0, 1, or 2
|
||||
|
||||
ALIGN(16)
|
||||
|
||||
SymCryptFdef369MontgomeryReduceAsmSubLoop:
|
||||
mov Q0,[Q10]
|
||||
sbb Q0,[Q9]
|
||||
mov [Q7], Q0
|
||||
|
||||
mov Q0,[Q10 + 8]
|
||||
sbb Q0,[Q9 + 8]
|
||||
mov [Q7 + 8], Q0
|
||||
|
||||
mov Q0,[Q10 + 16]
|
||||
sbb Q0,[Q9 + 16]
|
||||
mov [Q7 + 16], Q0
|
||||
|
||||
lea Q10,[Q10 + 24]
|
||||
lea Q9,[Q9 + 24]
|
||||
lea Q7,[Q7 + 24]
|
||||
|
||||
dec D11
|
||||
jnz SymCryptFdef369MontgomeryReduceAsmSubLoop
|
||||
|
||||
// Finally a masked copy form pSrc to pDst
|
||||
// copy if: Q8 == 0 && Cy = 1
|
||||
sbb Q8, 0 // mask (64 bits)
|
||||
|
||||
ALIGN(16)
|
||||
|
||||
SymCryptFdef369MontgomeryReduceAsmMaskedCopyLoop:
|
||||
mov Q0, [Q2]
|
||||
mov Q1, [Q3]
|
||||
xor Q0, Q1
|
||||
and Q0, Q8
|
||||
xor Q0, Q1
|
||||
mov [Q3], Q0
|
||||
|
||||
mov Q0, [Q2 + 8]
|
||||
mov Q1, [Q3 + 8]
|
||||
xor Q0, Q1
|
||||
and Q0, Q8
|
||||
xor Q0, Q1
|
||||
mov [Q3 + 8], Q0
|
||||
|
||||
mov Q0, [Q2 + 16]
|
||||
mov Q1, [Q3 + 16]
|
||||
xor Q0, Q1
|
||||
and Q0, Q8
|
||||
xor Q0, Q1
|
||||
mov [Q3 + 16], Q0
|
||||
|
||||
// Move on to the next digit
|
||||
|
||||
add Q2, 24
|
||||
add Q3, 24
|
||||
dec D4
|
||||
jnz SymCryptFdef369MontgomeryReduceAsmMaskedCopyLoop
|
||||
|
||||
MUL_FUNCTION_END(SymCryptFdef369MontgomeryReduceAsm)
|
||||
|
||||
FILE_END()
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
|
@ -1,224 +0,0 @@
|
|||
;
|
||||
; Macros for the multiplication routines in amd64
|
||||
; Copyright (c) Microsoft Corporation. Licensed under the MIT license.
|
||||
;
|
||||
|
||||
; General multiplication
|
||||
|
||||
MULT_SINGLEADD_128 MACRO index, src_reg, dst_reg
|
||||
; rax = mul scratch
|
||||
; rbx = multiplier
|
||||
; rdx = mul scratch
|
||||
; src_reg = running ptr to input
|
||||
; dst_reg = running ptr to output/scratch
|
||||
; r12 = carry for even words (64 bits)
|
||||
; r15 = carry for odd words (64 bits)
|
||||
|
||||
mov rax, [src_reg + 8*index]
|
||||
mul rbx
|
||||
mov r15, rdx
|
||||
add rax, r12
|
||||
mov [dst_reg + 8*index], rax
|
||||
adc r15, 0
|
||||
|
||||
mov rax, [src_reg + 8*(index+1)]
|
||||
mul rbx
|
||||
mov r12, rdx
|
||||
add rax, r15
|
||||
mov [dst_reg + 8*(index+1)], rax
|
||||
adc r12, 0
|
||||
|
||||
ENDM
|
||||
|
||||
MULT_DOUBLEADD_128 MACRO index, src_reg, dst_reg
|
||||
; rax = mul scratch
|
||||
; rbx = multiplier
|
||||
; rdx = mul scratch
|
||||
; src_reg = running ptr to input
|
||||
; dst_reg = running ptr to output/scratch
|
||||
; r12 = carry for even words (64 bits)
|
||||
; r15 = carry for odd words (64 bits)
|
||||
|
||||
mov rax, [src_reg + 8*index]
|
||||
mul rbx
|
||||
mov r15, rdx
|
||||
add rax, [dst_reg + 8*index]
|
||||
adc r15, 0
|
||||
add rax, r12
|
||||
mov [dst_reg + 8*index], rax
|
||||
adc r15, 0
|
||||
|
||||
mov rax, [src_reg + 8*(index+1)]
|
||||
mul rbx
|
||||
mov r12, rdx
|
||||
add rax, [dst_reg + 8*(index+1)]
|
||||
adc r12, 0
|
||||
add rax, r15
|
||||
mov [dst_reg + 8*(index+1)], rax
|
||||
adc r12, 0
|
||||
|
||||
ENDM
|
||||
|
||||
; Squaring
|
||||
|
||||
SQR_SINGLEADD_64 MACRO index, src_reg, dst_reg, src_carry, dst_carry
|
||||
; rax = mul scratch
|
||||
; rbx = multiplier
|
||||
; rdx = mul scratch
|
||||
; src_reg = running ptr to input
|
||||
; dst_reg = running ptr to output/scratch
|
||||
; src_carry = input carry
|
||||
; dst_carry = output carry
|
||||
|
||||
mov rax, [src_reg + 8*index]
|
||||
mul rbx
|
||||
mov dst_carry, rdx
|
||||
add rax, src_carry
|
||||
mov [dst_reg + 8*index], rax
|
||||
adc dst_carry, 0
|
||||
|
||||
ENDM
|
||||
|
||||
SQR_DOUBLEADD_64 MACRO index, src_reg, dst_reg, src_carry, dst_carry
|
||||
; rax = mul scratch
|
||||
; rbx = multiplier
|
||||
; rdx = mul scratch
|
||||
; src_reg = running ptr to input
|
||||
; dst_reg = running ptr to output/scratch
|
||||
; src_carry = input carry
|
||||
; dst_carry = output carry
|
||||
|
||||
mov rax, [src_reg + 8*index]
|
||||
mul rbx
|
||||
mov dst_carry, rdx
|
||||
add rax, [dst_reg + 8*index]
|
||||
adc dst_carry, 0
|
||||
add rax, src_carry
|
||||
mov [dst_reg + 8*index], rax
|
||||
adc dst_carry, 0
|
||||
|
||||
ENDM
|
||||
|
||||
SQR_SHIFT_LEFT MACRO index
|
||||
mov rax, [rdi + 8*index]
|
||||
adc rax, rax ; Shift let and add the carry
|
||||
mov [rdi + 8*index], rax
|
||||
ENDM
|
||||
|
||||
SQR_DIAGONAL_PROP MACRO index
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; Calculating the square
|
||||
mov rax, [rsi + 8*index] ; mulword
|
||||
mul rax ; m^2
|
||||
|
||||
; Adding the square to the even column
|
||||
add rax, [rdi + 16*index]
|
||||
adc rdx, 0
|
||||
add rax, r12
|
||||
adc rdx, 0
|
||||
mov [rdi + 16*index], rax
|
||||
|
||||
; Propagating the sum to the next column
|
||||
mov rax, rdx
|
||||
xor rdx, rdx
|
||||
|
||||
add rax, [rdi + 16*index + 8]
|
||||
adc rdx, 0
|
||||
mov [rdi + 16*index + 8], rax
|
||||
mov r12, rdx
|
||||
ENDM
|
||||
|
||||
; Size-specific macros
|
||||
; A common prologue & epilogue between several functions allows jumping between them...
|
||||
|
||||
MULT_COMMON_PROLOGUE MACRO
|
||||
; We need all the registers
|
||||
push_reg r12
|
||||
push_reg r13
|
||||
push_reg r14
|
||||
push_reg r15
|
||||
push_reg rdi
|
||||
push_reg rsi
|
||||
push_reg rbx
|
||||
push_reg rbp
|
||||
|
||||
END_PROLOGUE
|
||||
ENDM
|
||||
|
||||
MULT_COMMON_EPILOGUE MACRO
|
||||
BEGIN_EPILOGUE
|
||||
|
||||
pop rbp
|
||||
pop rbx
|
||||
pop rsi
|
||||
pop rdi
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
ret
|
||||
ENDM
|
||||
|
||||
|
||||
MUL14 MACRO Mult, pA, R0, R1, R2, R3, Cy
|
||||
; (R0, R1, R2, R3, rdx) = Mult * (A0..3) + (R0, R1, R2, R3)
|
||||
; Cy, rax = scratch
|
||||
|
||||
mov rax, [pA]
|
||||
mul Mult
|
||||
add R0, rax
|
||||
adc rdx, 0
|
||||
mov Cy, rdx
|
||||
|
||||
mov rax, [pA + 8]
|
||||
mul Mult
|
||||
add R1, rax
|
||||
adc rdx, 0
|
||||
add R1, Cy
|
||||
adc rdx, 0
|
||||
mov Cy, rdx
|
||||
|
||||
mov rax, [pA + 16]
|
||||
mul Mult
|
||||
add R2, rax
|
||||
adc rdx, 0
|
||||
add R2, Cy
|
||||
adc rdx, 0
|
||||
mov Cy, rdx
|
||||
|
||||
mov rax, [pA + 24]
|
||||
mul Mult
|
||||
add R3, rax
|
||||
adc rdx, 0
|
||||
add R3, Cy
|
||||
adc rdx, 0
|
||||
|
||||
ENDM
|
||||
|
||||
; Macros for size-specific squaring
|
||||
|
||||
SQR_DOUBLEADD_64_2 MACRO index
|
||||
SQR_DOUBLEADD_64 (index), rsi, rdi, r12, r15
|
||||
SQR_DOUBLEADD_64 (index + 1), rsi, rdi, r15, r12
|
||||
ENDM
|
||||
|
||||
SQR_DOUBLEADD_64_4 MACRO index
|
||||
SQR_DOUBLEADD_64_2 (index)
|
||||
SQR_DOUBLEADD_64_2 (index + 2)
|
||||
ENDM
|
||||
|
||||
SQR_DOUBLEADD_64_8 MACRO index
|
||||
SQR_DOUBLEADD_64_4 (index)
|
||||
SQR_DOUBLEADD_64_4 (index + 4)
|
||||
ENDM
|
||||
|
||||
SQR_SIZE_SPECIFIC_INIT MACRO
|
||||
lea rcx, [rcx + 8] ; move Src pointer 1 word over
|
||||
lea r10, [r10 + 16] ; move Dst pointer 2 words over
|
||||
|
||||
mov rsi, rcx ; rsi = inner pSrc
|
||||
mov rdi, r10 ; rdi = inner pDst
|
||||
|
||||
mov rbx, [rcx] ; Get the next mulword
|
||||
lea rsi, [rsi + 8] ; move Src pointer 1 word over
|
||||
ENDM
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
|
@ -1,423 +0,0 @@
|
|||
;
|
||||
; Sha1Asm.Asm
|
||||
;
|
||||
; Copyright (c) Microsoft Corporation. Licensed under the MIT license.
|
||||
;
|
||||
;
|
||||
|
||||
;
|
||||
; This module implements the bulk processing of the FIPS 180-1 SHA message digest algorithm.
|
||||
; for the x64 processor architecture.
|
||||
;
|
||||
; This implementation is derived from the 32-bit one, which in turn is derived
|
||||
; from an older one by Scott Field and Dan Shumow.
|
||||
;
|
||||
|
||||
include ksamd64.inc
|
||||
|
||||
TITLE sha1asm.asm
|
||||
|
||||
;
|
||||
; The four round constants used by SHA-1
|
||||
;
|
||||
|
||||
K0_19 EQU 05a827999H
|
||||
K20_39 EQU 06ed9eba1H
|
||||
K40_59 EQU 08f1bbcdcH
|
||||
K60_79 EQU 0ca62c1d6H
|
||||
|
||||
|
||||
;VOID
|
||||
;SYMCRYPT_CALL
|
||||
;SymCryptSha1AppendBlocks( _Inout_updates_( 5 ) PUINT32 H,
|
||||
; _In_reads_bytes_( cbData ) PCBYTE pbData,
|
||||
; SIZE_T cbData )
|
||||
;
|
||||
|
||||
;
|
||||
; This function allocates stack space, so it is not a LEAF function
|
||||
; but a nested one.
|
||||
;
|
||||
NESTED_ENTRY SymCryptSha1AppendBlocksAsm, _TEXT
|
||||
|
||||
;
|
||||
; To keep stack manipulations simple we define a structure and use that for all accesses.
|
||||
;
|
||||
|
||||
SymCryptSha1AppendBlocksFrame struct 16, NONUNIQUE
|
||||
;
|
||||
; To keep the RSP aligned we need (8 mod 16) bytes of local stack space.
|
||||
; this is the case, so there is no need for a dummy location
|
||||
;
|
||||
Wbuf dd 16 dup (?)
|
||||
EndAddress dq ?
|
||||
SaveR12 dq ?
|
||||
SaveR13 dq ?
|
||||
SaveR14 dq ?
|
||||
SaveR15 dq ?
|
||||
SaveRdi dq ?
|
||||
SaveRsi dq ?
|
||||
SaveRbp dq ?
|
||||
SaveRbx dq ?
|
||||
ReturnAddress dq ?
|
||||
CallerP1Home dq ?
|
||||
CallerP2Home dq ?
|
||||
CallerP3Home dq ?
|
||||
CallerP4Home dq ?
|
||||
|
||||
SymCryptSha1AppendBlocksFrame ends
|
||||
|
||||
;
|
||||
; We use the W buffer extensively; this is a shorthand for the base address
|
||||
;
|
||||
W equ rsp+SymCryptSha1AppendBlocksFrame.Wbuf
|
||||
|
||||
|
||||
|
||||
;
|
||||
; Set up our stack frame and save non-volatile registers
|
||||
;
|
||||
rex_push_reg rbx
|
||||
push_reg rbp
|
||||
push_reg rsi
|
||||
push_reg rdi
|
||||
push_reg r15
|
||||
push_reg r14
|
||||
push_reg r13
|
||||
push_reg r12
|
||||
alloc_stack SymCryptSha1AppendBlocksFrame.SaveR12
|
||||
|
||||
END_PROLOGUE
|
||||
|
||||
;
|
||||
;Register allocation:
|
||||
;
|
||||
;5 registers for state
|
||||
;2 scratch
|
||||
;6 registers for W[t-1], W[t-2], W[t-3], W[t-14], W[t-15], W[t-16]
|
||||
;1 for data pointer
|
||||
;1 for H pointer
|
||||
;
|
||||
;
|
||||
; To allow macro re-ordering of our registers we use symbolic names
|
||||
; for the registers.
|
||||
; s0-s4 are the 5 state registers. x1 and x2 are extra scratch registers.
|
||||
; w0-w5 contain the W state cache
|
||||
;
|
||||
; Note: some other code puts the right value in the right register and
|
||||
; has to be updated if this mapping is changed.
|
||||
;
|
||||
; a is in register (round % 5)
|
||||
; b is in register (round+4 % 5)
|
||||
; c is in register (round+3 % 5)
|
||||
; d is in register (round+2 % 5)
|
||||
; e is in register (round+1 % 5)
|
||||
; This way, if round is incremented we move a->b, b->c, c->d, d->e, and e->a
|
||||
; For optimization the actual value of a is in scratch register x1 at the start of each round
|
||||
;
|
||||
; W[t- 1] is in register (round % 6)
|
||||
; W[t- 2] is in register (round+5 % 6)
|
||||
; W[t- 3] is in register (round+4 % 6) (is loaded with W[t-13] in each round)
|
||||
; W[t-14] is in register (round+3 % 6)
|
||||
; W[t-15] is in register (round+2 % 6)
|
||||
; W[t-16] is in register (round+1 % 6)
|
||||
; If round is incremented the values all appear in their right place.
|
||||
|
||||
s0 EQU eax
|
||||
s1 EQU ebx
|
||||
s2 EQU ecx
|
||||
s3 EQU edx
|
||||
s4 EQU esi
|
||||
|
||||
w0 EQU r9d
|
||||
w1 EQU r10d
|
||||
w2 EQU r11d
|
||||
w3 EQU r12d
|
||||
w4 EQU r13d
|
||||
w5 EQU r14d
|
||||
|
||||
x1 EQU ebp ; screatch 1
|
||||
x2 EQU edi ; scratch 2
|
||||
|
||||
dataPtr EQU r8 ; Points to data buffer
|
||||
HPtr EQU r15 ; Points to H
|
||||
|
||||
|
||||
; At this point:
|
||||
; rcx = H
|
||||
; rdx = pbData
|
||||
; r8 = cbData
|
||||
;
|
||||
; compute the end address, address of byte after last block we will process
|
||||
; This code ensures that we never exceed the data buffer we were given,
|
||||
; although we silently round the cbData parameter down to the next
|
||||
; multiple of 64.
|
||||
; Do nothing if no blocks need to be processed.
|
||||
;
|
||||
and r8,NOT 3fh ; round down to multiple of 64
|
||||
jz SymCryptSha1AppendBlocksDone
|
||||
add r8,rdx ; pbData + (cbData & 0x3f)
|
||||
mov [rsp+SymCryptSha1AppendBlocksFrame.EndAddress], r8
|
||||
|
||||
mov dataPtr,rdx
|
||||
mov Hptr,rcx
|
||||
|
||||
;
|
||||
; Load the H state, note that the a value lives in x1 at the round code boundary
|
||||
;
|
||||
mov x1,[Hptr ]
|
||||
mov s4,[Hptr+ 4]
|
||||
mov s3,[Hptr+ 8]
|
||||
mov s2,[Hptr+12]
|
||||
mov s1,[Hptr+16]
|
||||
|
||||
|
||||
SymCryptSha1AppendBlocksLoop:
|
||||
;
|
||||
; This is the main loop. We process 64 bytes in each iteration.
|
||||
;
|
||||
; Most of the code in the loop is generated through macros using parameters to
|
||||
; rename the registers.
|
||||
;
|
||||
|
||||
ROUND_CH_0_15 MACRO round,sa,sb,sc,sd,se,wt,x1,x2
|
||||
;
|
||||
; Code for round 0-15.
|
||||
; This code loads data from the data buffer & BSWAPs the data to get it into the
|
||||
; right form.
|
||||
;
|
||||
; Parameters:
|
||||
; round round number
|
||||
; sa register that will contain the a value
|
||||
; sb register that contains the b value
|
||||
; sc register that contains the c value
|
||||
; sd register that contains the d value
|
||||
; se register that contains the e value
|
||||
; x1 scratch, contains the a value on entry
|
||||
; x2 scratch register.
|
||||
; wt register loaded with Wt
|
||||
;
|
||||
; We use the formula CH(b,c,d) = ((d ^ c) & b) ^ c which uses only one temp register.
|
||||
; We start with the d value as that is the oldest value and available the first
|
||||
;
|
||||
; See FIPS 180-2 for our symbolic notation.
|
||||
;
|
||||
mov x2,sd ; x2 = d
|
||||
mov wt,[dataPtr+4*round] ; Fetch word from message
|
||||
mov sa, x1 ; put a in the correct register
|
||||
|
||||
bswap wt ; wt = Wt
|
||||
xor x2,sc ; x2 = (d ^ c)
|
||||
rol x1,5 ; x1 = ROL(a,5)
|
||||
|
||||
add se,wt ; se = e + Wt
|
||||
and x2,sb ; x2 = ((d ^ c) & b)
|
||||
mov [W + 4*round],wt ; Store in W buffer for future use
|
||||
ror sb,2 ; sb = ROL( b, 30 )
|
||||
|
||||
add se,x1 ; se = e + Wt + ROL(a,5)
|
||||
xor x2,sd ; x2 = ((d ^ c) & b) ^ d = CH(b,c,d)
|
||||
|
||||
lea x1,[se+x2+K0_19] ; x1 = e + Wt + ROL(a,5) + Ch(b,c,d) + Kt
|
||||
|
||||
ENDM
|
||||
|
||||
MSG_EXP MACRO round, se, wa, wb, wc
|
||||
; round round number
|
||||
; se register of state to add expanded message word to
|
||||
; wa register of W[round-16], will be updated to contain W[round]
|
||||
; wb register of W[round-14]
|
||||
; wc register of W[round- 3], will be loaded with W[round-13]
|
||||
|
||||
xor wc, wb ; wc = W[t-3] ^ W[t-14]
|
||||
xor wa,[W+4*((round-8) MOD 16)] ; wa = W[t-16] ^ W[t-8]
|
||||
xor wa, wc ; wa = W[t-16] ^ W[t-14] ^ W[t-8] ^ W[t-3]
|
||||
rol wa,1 ; wa = Wt
|
||||
IF round LT (80 - 1)
|
||||
; do not load wc with W[t-13] in the last round; it will not be needed
|
||||
mov wc,[W+4*((round-13) MOD 16)] ; wc = W[t-13]
|
||||
ENDIF
|
||||
add se,wa ; re = e + Wt
|
||||
IF round LT (80 - 8)
|
||||
; don't store Wt in the last 8 rounds. The value would never be used
|
||||
mov [W+4*(round MOD 16)], wa; Store Wt
|
||||
ENDIF
|
||||
ENDM
|
||||
|
||||
ROUND_CH MACRO round, sa, sb, sc, sd, se, wa, wb, wc, x1, x2
|
||||
;
|
||||
; See ROUND_CH_0_15 for most parameters.
|
||||
; x1 and x2 are both scratch registers
|
||||
; wa register of W[round-16], will be updated to contain W[round]
|
||||
; wb register of W[round-14]
|
||||
; wc register of W[round- 3], will be loaded with W[round-13]
|
||||
;
|
||||
|
||||
xor wc, wb ; wc = W[t-3] ^ W[t-14]
|
||||
xor wa,[W+4*((round-8) MOD 16)] ; wa = W[t-16] ^ W[t-8]
|
||||
xor wa, wc ; wa = W[t-16] ^ W[t-14] ^ W[t-8] ^ W[t-3]
|
||||
rol wa,1 ; wa = Wt
|
||||
mov wc,[W+4*((round-13) MOD 16)] ; wc = W[t-13]
|
||||
add se,wa ; re = e + Wt
|
||||
mov [W+4*(round MOD 16)], wa ; Store Wt
|
||||
|
||||
mov sa, x1 ; put a in the correct register
|
||||
mov x2,sd ; x2 = d
|
||||
rol x1,5 ; x1 = ROL(a,5)
|
||||
xor x2,sc ; x2 = (d ^ c)
|
||||
add se,x1 ; re = e + Wt + ROL(a,5)
|
||||
and x2,sb ; x2 = ((d ^ c) & b)
|
||||
ror sb,2 ; rb = ROL( b, 30 )
|
||||
xor x2,sd ; x2 = ((d ^ c) & b) ^ d = CH(b,c,d)
|
||||
lea x1,[se+x2+K0_19] ; re = e + Wt + ROL(a,5) + Ch(b,c,d) + Kt
|
||||
ENDM
|
||||
|
||||
ROUND_PARITY MACRO round, sa, sb, sc, sd, se, wa, wb, wc, x1, x2, K
|
||||
;
|
||||
; See ROUND_CH for most parameters
|
||||
; K is the round constant to use.
|
||||
;
|
||||
; The order of xorring the registers b, c, and d is driven by the data dependency graph.
|
||||
; We start with d (the oldest) and then do b to unblock the subsequent rotate
|
||||
;
|
||||
MSG_EXP round, se, wa, wb, wc ; re = e + Wt
|
||||
|
||||
mov sa,x1 ; store a value in right register
|
||||
rol x1,5 ; x1 = ROL(a,5)
|
||||
add se,x1 ; re = e + Wt + ROL(a,5)
|
||||
|
||||
mov x2,sd ; x1 = d
|
||||
xor x2,sb ; x1 = (d ^ b)
|
||||
xor x2,sc ; x1 = (d ^ b ^ c) = Parity(b,c,d)
|
||||
ror sb,2 ; rb = ROL( b, 30 )
|
||||
lea x1,[se+x2+K] ; re = e + ROL(a,5) + Parity(b,c,d) + Wt + Kt
|
||||
|
||||
ENDM
|
||||
|
||||
ROUND_MAJ MACRO round, sa, sb, sc, sd, se, wa, wb, wc, x1, x2
|
||||
;
|
||||
; See above for parameter explanation
|
||||
;
|
||||
MSG_EXP round, se, wa, wb, wc ; re = e + Wt
|
||||
|
||||
mov sa,x1 ; store a value in right register
|
||||
rol x1,5 ; x1 = ROL(a,5)
|
||||
add se,x1 ; re = e + ROL(a,5)
|
||||
mov x1,sd ; x1 = d
|
||||
or x1,sc ; x1 = (d | c)
|
||||
and x1,sb ; x1 = ((d | c) & b)
|
||||
|
||||
mov x2,sc ; x2 = c
|
||||
and x2,sd ; x2 = (c & d)
|
||||
or x1,x2 ; x1 = ((d | c) & b) | (d & c) = MAJ(b,c,d)
|
||||
|
||||
ror sb,2 ; rb = ROL( b, 30 )
|
||||
|
||||
lea x1,[se+x1+K40_59] ; re = e + ROL(a,5) + Wt + Maj(b,c,d) + Kt
|
||||
ENDM
|
||||
|
||||
|
||||
;
|
||||
; With these macros we can now produce the actual code.
|
||||
; Note the use of the % operator which evaluates the expression and yields the result as text.
|
||||
; Together with the macros and the r<i> EQUs this provides us with automatic register renaming
|
||||
; for each round.
|
||||
;
|
||||
; The first 16 rounds are more complicated as we need to use the right registers to load the msg in
|
||||
; so we do those by hand
|
||||
;
|
||||
; W[t- 1] is in register (round % 6)
|
||||
; W[t- 2] is in register (round+5 % 6)
|
||||
; W[t- 3] is in register (round+4 % 6) (is loaded with W[t-13] in each round)
|
||||
; W[t-14] is in register (round+3 % 6)
|
||||
; W[t-15] is in register (round+2 % 6)
|
||||
; W[t-16] is in register (round+1 % 6)
|
||||
;
|
||||
ROUND_CH_0_15 0, s0, s4, s3, s2, s1, w5, x1, x2 ;W[t-16] for t=16 is in w5
|
||||
ROUND_CH_0_15 1, s1, s0, s4, s3, s2, w0, x1, x2 ;W[t-15] for t=16 is in w0
|
||||
ROUND_CH_0_15 2, s2, s1, s0, s4, s3, w1, x1, x2 ;W[t-14] for t=16 is in w1
|
||||
ROUND_CH_0_15 3, s3, s2, s1, s0, s4, w3, x1, x2 ;
|
||||
ROUND_CH_0_15 4, s4, s3, s2, s1, s0, w4, x1, x2 ;
|
||||
ROUND_CH_0_15 5, s0, s4, s3, s2, s1, w3, x1, x2 ;
|
||||
ROUND_CH_0_15 6, s1, s0, s4, s3, s2, w4, x1, x2 ;
|
||||
ROUND_CH_0_15 7, s2, s1, s0, s4, s3, w3, x1, x2 ;
|
||||
ROUND_CH_0_15 8, s3, s2, s1, s0, s4, w4, x1, x2 ;
|
||||
ROUND_CH_0_15 9, s4, s3, s2, s1, s0, w3, x1, x2 ;
|
||||
ROUND_CH_0_15 10, s0, s4, s3, s2, s1, w4, x1, x2 ;
|
||||
ROUND_CH_0_15 11, s1, s0, s4, s3, s2, w3, x1, x2 ;
|
||||
ROUND_CH_0_15 12, s2, s1, s0, s4, s3, w4, x1, x2 ;
|
||||
ROUND_CH_0_15 13, s3, s2, s1, s0, s4, w2, x1, x2 ;W[t-3] for t=16 is in w2
|
||||
ROUND_CH_0_15 14, s4, s3, s2, s1, s0, w3, x1, x2 ;W[t-2] for t=16 is in w3
|
||||
ROUND_CH_0_15 15, s0, s4, s3, s2, s1, w4, x1, x2 ;W[t-1] for t=16 is in w4
|
||||
|
||||
|
||||
FOR t, <16, 17, 18, 19>
|
||||
ROUND_CH t, s%(t MOD 5), s%((t+4) MOD 5), s%((t+3) MOD 5), s%((t+2) MOD 5), s%((t+1) MOD 5), w%((t+1) MOD 6), w%((t+3) MOD 6), w%((t+4) MOD 6), x1, x2
|
||||
ENDM
|
||||
|
||||
FOR t, <20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39>
|
||||
ROUND_PARITY t, s%(t MOD 5), s%((t+4) MOD 5), s%((t+3) MOD 5), s%((t+2) MOD 5), s%((t+1) MOD 5), w%((t+1) MOD 6), w%((t+3) MOD 6), w%((t+4) MOD 6), x1, x2, K20_39
|
||||
ENDM
|
||||
|
||||
FOR t, <40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59>
|
||||
ROUND_MAJ t, s%(t MOD 5), s%((t+4) MOD 5), s%((t+3) MOD 5), s%((t+2) MOD 5), s%((t+1) MOD 5), w%((t+1) MOD 6), w%((t+3) MOD 6), w%((t+4) MOD 6), x1, x2
|
||||
ENDM
|
||||
|
||||
FOR t, <60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79>
|
||||
ROUND_PARITY t, s%(t MOD 5), s%((t+4) MOD 5), s%((t+3) MOD 5), s%((t+2) MOD 5), s%((t+1) MOD 5), w%((t+1) MOD 6), w%((t+3) MOD 6), w%((t+4) MOD 6), x1, x2, K60_79
|
||||
ENDM
|
||||
|
||||
;
|
||||
; Now we update the state, & the dataPtr
|
||||
;
|
||||
add x1,[Hptr ]
|
||||
add s4,[Hptr+ 4]
|
||||
add dataPtr,64
|
||||
add s3,[Hptr+ 8]
|
||||
add s2,[Hptr+12]
|
||||
add s1,[Hptr+16]
|
||||
|
||||
mov [Hptr ], x1
|
||||
mov [Hptr+ 4], s4
|
||||
cmp dataPtr,[rsp+SymCryptSha1AppendBlocksFrame.EndAddress] ; Loop terminating condition
|
||||
mov [Hptr+ 8], s3
|
||||
mov [Hptr+12], s2
|
||||
mov [Hptr+16], s1
|
||||
|
||||
jc SymCryptSha1AppendBlocksLoop ; Main loop
|
||||
|
||||
;
|
||||
; We're done processing the blocks. The result is already in the state, so all we have to do
|
||||
; is clean up.
|
||||
;
|
||||
; Wipe the W buffer
|
||||
; The @@: label is an anonymous label. You can refer to the previous one using @B, which is easy to read.
|
||||
;
|
||||
mov rcx,64
|
||||
xor rax,rax
|
||||
@@: sub ecx,16
|
||||
mov [rsp+rcx ],rax
|
||||
mov [rsp+rcx+8],rax
|
||||
jnz @B
|
||||
|
||||
SymCryptSha1AppendBlocksDone:
|
||||
|
||||
|
||||
add rsp, SymCryptSha1AppendBlocksFrame.SaveR12
|
||||
|
||||
BEGIN_EPILOGUE
|
||||
pop r12
|
||||
pop r13
|
||||
pop r14
|
||||
pop r15
|
||||
pop rdi
|
||||
pop rsi
|
||||
pop rbp
|
||||
pop rbx
|
||||
|
||||
ret
|
||||
|
||||
NESTED_END SymCryptSha1AppendBlocksAsm, _TEXT
|
||||
|
||||
END
|
||||
|
|
@ -1,37 +0,0 @@
|
|||
;
|
||||
; SymCrypt_magic.inc
|
||||
; Copyright (c) Microsoft Corporation. Licensed under the MIT license.
|
||||
;
|
||||
; Include file to define the support macros for the Magic field
|
||||
;
|
||||
|
||||
extern SymCryptFatal:NEAR
|
||||
|
||||
|
||||
SYMCRYPT_MAGIC_FIELD MACRO
|
||||
|
||||
if DBG
|
||||
magic dq ?
|
||||
endif
|
||||
|
||||
ENDM
|
||||
|
||||
SYMCRYPT_CODE_VERSION EQU ((SYMCRYPT_CODE_VERSION_API SHL 16) OR SYMCRYPT_CODE_VERSION_MINOR )
|
||||
SYMCRYPT_MAGIC_CONSTANT EQU ('S1mv' + SYMCRYPT_CODE_VERSION)
|
||||
|
||||
SYMCRYPT_CHECK_MAGIC MACRO ptr, struct_name
|
||||
|
||||
if DBG
|
||||
|
||||
mov rax, [ptr + struct_name.magic]
|
||||
sub rax, ptr
|
||||
cmp rax, SYMCRYPT_MAGIC_CONSTANT
|
||||
jz @F
|
||||
mov ecx, 'magc'
|
||||
call SymCryptFatal
|
||||
@@:
|
||||
endif
|
||||
|
||||
ENDM
|
||||
|
||||
|
|
@ -1,171 +0,0 @@
|
|||
;
|
||||
; Wipe.asm
|
||||
;
|
||||
; Copyright (c) Microsoft Corporation. Licensed under the MIT license.
|
||||
;
|
||||
|
||||
include ksamd64.inc
|
||||
|
||||
TITLE wipe.asm
|
||||
|
||||
;VOID
|
||||
;SYMCRYPT_CALL
|
||||
;SymCryptWipe( _Out_writes_bytes_( cbData ) PVOID pbData,
|
||||
; SIZE_T cbData )
|
||||
|
||||
;
|
||||
; This function allocates no stack space, calls no functions, and does not save
|
||||
; any non-volatile registers. Thusm it is a LEAF function
|
||||
;
|
||||
LEAF_ENTRY SymCryptWipeAsm, _TEXT
|
||||
|
||||
; rcx = pbData
|
||||
; rdx = cbData
|
||||
|
||||
;
|
||||
; This function will handle any alignment of pbData and any size, but it is optimized for
|
||||
; the case where the start and end of the buffer are 16-aligned.
|
||||
; 16 is the natural stack alignment on AMD64, and structures can be designed to be a multiple
|
||||
; of 16 long without adding too much slack.
|
||||
; The cost of non-alignment is relatively low, in the order of 5 cycles or so
|
||||
;
|
||||
|
||||
xorps xmm0,xmm0 ; Zero register for 16-byte wipes
|
||||
cmp rdx,16
|
||||
jb SymCryptWipeAsmSmall ; if cbData < 16, this is a rare case
|
||||
|
||||
test rcx,15
|
||||
jnz SymCryptWipeAsmUnAligned; if data pointer is unaligned, we jump to the code that aligns the pointer
|
||||
; For well-optimized callers the aligned case is the common one, and that is
|
||||
; the fall-through.
|
||||
|
||||
SymCryptWipeAsmAligned:
|
||||
;
|
||||
; Here rcx is aligned, and rdx contains the # bytes left to wipe, and rdx >= 16
|
||||
;
|
||||
; Our loop wipes in 32-byte increments; we always wipe the first 16 bytes if
|
||||
; and increment the pbData pointer if cbData is 16 mod 32
|
||||
; This avoids a conditional jump and is faster.
|
||||
;
|
||||
test rdx,16
|
||||
movaps [rcx],xmm0 ; it is safe to always wipe as cbData >= 16
|
||||
lea r8,[rcx+16]
|
||||
cmovnz rcx,r8 ; only increment pbData if cbData = 16 mod 32
|
||||
|
||||
sub rdx,32 ; see if we have >= 32 bytes to wipe
|
||||
jc SymCryptWipeAsmTailOptional ; if not, wipe tail, or nothing if cbData = 0 mod 16
|
||||
|
||||
align 16
|
||||
|
||||
SymCryptWipeAsmLoop:
|
||||
movaps [rcx],xmm0
|
||||
movaps [rcx+16],xmm0 ; Wipe 32 bytes
|
||||
add rcx,32
|
||||
sub rdx,32
|
||||
jnc SymCryptWipeAsmLoop
|
||||
|
||||
SymCryptWipeAsmTailOptional:
|
||||
; only the lower 4 bits of rdx are valid, we have subtracted too much already.
|
||||
; The wipe was at least 16 bytes, so we can just wipe the tail in one instruction
|
||||
|
||||
and edx,15
|
||||
jnz SymCryptWipeAsmTail
|
||||
ret
|
||||
|
||||
SymCryptWipeAsmTail:
|
||||
; This code appears also below at the end of the unaligned wiping routine
|
||||
; but making the jnz jump further is slower and we only duplicate 4 instructions.
|
||||
xor eax,eax
|
||||
mov [rcx+rdx-16],rax
|
||||
mov [rcx+rdx-8],rax
|
||||
ret
|
||||
|
||||
align 4
|
||||
SymCryptWipeAsmUnaligned:
|
||||
|
||||
;
|
||||
; At this point we know that cbData(rdx) >= 16 and pbData(rcx) is unaligned.
|
||||
; We can wipe 16 bytes and move to an aligned position
|
||||
;
|
||||
xor eax,eax
|
||||
mov [rcx],rax
|
||||
mov [rcx+8],rax
|
||||
|
||||
mov eax,ecx ;
|
||||
neg eax ; lower 4 bits of eax = # bytes to wipe to reach alignment
|
||||
and eax,15
|
||||
add rcx,rax
|
||||
sub rdx,rax
|
||||
|
||||
;
|
||||
; If rdx > 16, go to the aligned wiping loop
|
||||
;
|
||||
cmp rdx,16
|
||||
jae SymCryptWipeAsmAligned ; if cbData >= 16, do aligned wipes
|
||||
|
||||
;
|
||||
; We have <= 16 bytes to wipe, and we know that the full wipe region was at least 16 bytes.
|
||||
; We just wipe the last 16 bytes completely.
|
||||
;
|
||||
xor eax,eax
|
||||
mov [rcx+rdx-16],rax
|
||||
mov [rcx+rdx-8],rax
|
||||
ret
|
||||
|
||||
|
||||
align 8
|
||||
SymCryptWipeAsmSmall:
|
||||
; rcx = pbData, possibly unaligned
|
||||
; rdx = cbData; rdx < 16
|
||||
;
|
||||
; With speculative execution attacks, the cost of a jump table is prohibitive.
|
||||
; We use a compare ladder for 5 cases:
|
||||
; 8-15 bytes
|
||||
; 4-7 bytes
|
||||
; 2-3 bytes
|
||||
; 1 byte
|
||||
; 0 bytes
|
||||
|
||||
xor eax,eax
|
||||
|
||||
cmp edx, 8
|
||||
jb SymCryptWipeAsmSmallLessThan8
|
||||
|
||||
; wipe 8-15 bytes using two possibly overlapping writes
|
||||
mov [rcx], rax
|
||||
mov [rcx + rdx - 8], rax
|
||||
ret
|
||||
|
||||
SymCryptWipeAsmSmallLessThan8:
|
||||
cmp edx, 4
|
||||
jb SymCryptWipeAsmSmallLessThan4
|
||||
|
||||
; wipe 4-7 bytes
|
||||
mov [rcx], eax
|
||||
mov [rcx + rdx - 4], eax
|
||||
ret
|
||||
|
||||
SymCryptWipeAsmSmallLessThan4:
|
||||
cmp edx, 2
|
||||
jb SymCryptWipeAsmSmallLessThan2
|
||||
|
||||
; wipe 2-3 bytes
|
||||
mov [rcx], ax
|
||||
mov [rcx + rdx - 2], ax
|
||||
ret
|
||||
|
||||
SymCryptWipeAsmSmallLessThan2:
|
||||
or edx, edx
|
||||
jz SymCryptWipeAsmSmallDone
|
||||
|
||||
; wipe 1 byte
|
||||
mov [rcx], al
|
||||
|
||||
SymCryptWipeAsmSmallDone:
|
||||
|
||||
ret
|
||||
|
||||
LEAF_END SymCryptWipeAsm, _TEXT
|
||||
|
||||
END
|
||||
|
|
@ -0,0 +1,165 @@
|
|||
//
|
||||
// wipe.symcryptasm Assembler code for wiping a buffer
|
||||
// Expresses asm in a generic enough way to enable generation of MASM and GAS using the
|
||||
// symcryptasm_processor.py script and C preprocessor
|
||||
//
|
||||
// Copyright (c) Microsoft Corporation. Licensed under the MIT license.
|
||||
|
||||
|
||||
#include "symcryptasm_shared.cppasm"
|
||||
|
||||
//VOID
|
||||
//SYMCRYPT_CALL
|
||||
//SymCryptWipe( _Out_writes_bytes_( cbData ) PVOID pbData,
|
||||
// SIZE_T cbData )
|
||||
|
||||
FUNCTION_START(SymCryptWipeAsm, 2, 4)
|
||||
|
||||
// Q1 = pbData
|
||||
// Q2 = cbData
|
||||
|
||||
//
|
||||
// This function will handle any alignment of pbData and any size, but it is optimized for
|
||||
// the case where the start and end of the buffer are 16-aligned.
|
||||
// 16 is the natural stack alignment on AMD64, and structures can be designed to be a multiple
|
||||
// of 16 long without adding too much slack.
|
||||
// The cost of non-alignment is relatively low, in the order of 5 cycles or so
|
||||
//
|
||||
|
||||
xorps xmm0,xmm0 // Zero register for 16-byte wipes
|
||||
cmp Q2,16
|
||||
jb SymCryptWipeAsmSmall // if cbData < 16, this is a rare case
|
||||
|
||||
test Q1,15
|
||||
jnz SymCryptWipeAsmUnaligned // if data pointer is unaligned, we jump to the code that aligns the pointer
|
||||
// For well-optimized callers the aligned case is the common one, and that is
|
||||
// the fall-through.
|
||||
|
||||
SymCryptWipeAsmAligned:
|
||||
//
|
||||
// Here Q1 is aligned, and Q2 contains the # bytes left to wipe, and Q2 >= 16
|
||||
//
|
||||
// Our loop wipes in 32-byte increments; we always wipe the first 16 bytes
|
||||
// and increment the pbData pointer if cbData is 16 mod 32
|
||||
// This avoids a conditional jump and is faster.
|
||||
//
|
||||
test Q2,16
|
||||
movaps [Q1],xmm0 // it is safe to always wipe as cbData >= 16
|
||||
lea Q3,[Q1+16]
|
||||
cmovnz Q1,Q3 // only increment pbData if cbData = 16 mod 32
|
||||
|
||||
sub Q2,32 // see if we have >= 32 bytes to wipe
|
||||
jc SymCryptWipeAsmTailOptional // if not, wipe tail, or nothing if cbData = 0 mod 16
|
||||
|
||||
ALIGN(16)
|
||||
|
||||
SymCryptWipeAsmLoop:
|
||||
movaps [Q1],xmm0
|
||||
movaps [Q1+16],xmm0 // Wipe 32 bytes
|
||||
add Q1,32
|
||||
sub Q2,32
|
||||
jnc SymCryptWipeAsmLoop
|
||||
|
||||
SymCryptWipeAsmTailOptional:
|
||||
// only the lower 4 bits of Q2 are valid, we have subtracted too much already.
|
||||
// The wipe was at least 16 bytes, so we can just wipe the tail with 2 instructions
|
||||
|
||||
and D2,15
|
||||
jnz SymCryptWipeAsmTail
|
||||
ret
|
||||
|
||||
SymCryptWipeAsmTail:
|
||||
// This code appears also below at the end of the unaligned wiping routine
|
||||
// but making the jnz jump further is slower and we only duplicate 4 instructions.
|
||||
xor D0,D0
|
||||
mov [Q1+Q2-16],Q0
|
||||
mov [Q1+Q2-8],Q0
|
||||
ret
|
||||
|
||||
ALIGN(4)
|
||||
|
||||
SymCryptWipeAsmUnaligned:
|
||||
|
||||
//
|
||||
// At this point we know that cbData(Q2) >= 16 and pbData(Q1) is unaligned.
|
||||
// We can wipe 16 bytes and move to an aligned position
|
||||
//
|
||||
xor D0,D0
|
||||
mov [Q1],Q0
|
||||
mov [Q1+8],Q0
|
||||
|
||||
mov D0,D1
|
||||
neg D0 // lower 4 bits of D0 = # bytes to wipe to reach alignment
|
||||
and D0,15
|
||||
add Q1,Q0
|
||||
sub Q2,Q0
|
||||
|
||||
//
|
||||
// If Q2 > 16, go to the aligned wiping loop
|
||||
//
|
||||
cmp Q2,16
|
||||
jae SymCryptWipeAsmAligned // if cbData >= 16, do aligned wipes
|
||||
|
||||
//
|
||||
// We have <= 16 bytes to wipe, and we know that the full wipe region was at least 16 bytes.
|
||||
// We just wipe the last 16 bytes completely.
|
||||
//
|
||||
xor D0,D0
|
||||
mov [Q1+Q2-16],Q0
|
||||
mov [Q1+Q2-8],Q0
|
||||
ret
|
||||
|
||||
ALIGN(8)
|
||||
|
||||
SymCryptWipeAsmSmall:
|
||||
// Q1 = pbData, possibly unaligned
|
||||
// Q2 = cbData; Q2 < 16
|
||||
//
|
||||
// With speculative execution attacks, the cost of a jump table is prohibitive.
|
||||
// We use a compare ladder for 5 cases:
|
||||
// 8-15 bytes
|
||||
// 4-7 bytes
|
||||
// 2-3 bytes
|
||||
// 1 byte
|
||||
// 0 bytes
|
||||
|
||||
xor D0,D0
|
||||
|
||||
cmp D2, 8
|
||||
jb SymCryptWipeAsmSmallLessThan8
|
||||
|
||||
// wipe 8-15 bytes using two possibly overlapping writes
|
||||
mov [Q1],Q0
|
||||
mov [Q1+Q2-8],Q0
|
||||
ret
|
||||
|
||||
SymCryptWipeAsmSmallLessThan8:
|
||||
cmp D2, 4
|
||||
jb SymCryptWipeAsmSmallLessThan4
|
||||
|
||||
// wipe 4-7 bytes
|
||||
mov [Q1],D0
|
||||
mov [Q1+Q2-4],D0
|
||||
ret
|
||||
|
||||
SymCryptWipeAsmSmallLessThan4:
|
||||
cmp D2, 2
|
||||
jb SymCryptWipeAsmSmallLessThan2
|
||||
|
||||
// wipe 2-3 bytes
|
||||
mov [Q1],W0
|
||||
mov [Q1+Q2-2],W0
|
||||
ret
|
||||
|
||||
SymCryptWipeAsmSmallLessThan2:
|
||||
or D2,D2
|
||||
jz SymCryptWipeAsmSmallDone
|
||||
|
||||
// wipe 1 byte
|
||||
mov [Q1],B0
|
||||
|
||||
SymCryptWipeAsmSmallDone:
|
||||
|
||||
FUNCTION_END(SymCryptWipeAsm)
|
||||
|
||||
FILE_END()
|
|
@ -9,7 +9,11 @@
|
|||
#include "symcrypt_version.inc"
|
||||
#include "symcrypt_magic.inc"
|
||||
|
||||
; As Arm assembler already uses C preprocessor, we can just hardcode this asm to include constants
|
||||
; MASM for now. To be fixed properly when converting arm64 asm to symcryptasm.
|
||||
#define SYMCRYPT_MASM
|
||||
#include "C_asm_shared.inc"
|
||||
#undef SYMCRYPT_MASM
|
||||
|
||||
; A digit consists of 4 words of 32 bits each
|
||||
|
||||
|
@ -449,11 +453,11 @@ SymCryptFdefRawSquareAsmInnerLoopInit_Word1
|
|||
|
||||
SQR_SINGLEADD_32 3
|
||||
|
||||
|
||||
|
||||
add r2, r2, #16
|
||||
add r4, r4, #16
|
||||
|
||||
adds r3, r3, #1 ; move one digit up
|
||||
adds r3, r3, #1 ; move one digit up
|
||||
bne SymCryptFdefRawSquareAsmInnerLoopInit_Word0
|
||||
|
||||
str r11, [r4] ; Store the next word into the destination
|
||||
|
@ -689,7 +693,7 @@ SymCryptFdefMontgomeryReduceAsmInner
|
|||
adds r11, r11, r7 ; c + pSrc[nWords] + hc
|
||||
adc r8, r8, #0 ; Add the carry if any
|
||||
str r11, [r1], #4 ; pSrc[nWords] = c
|
||||
|
||||
|
||||
adds r12, r12, r6 ; c + pSrc[nWords+1]
|
||||
adc r9, r9, #0 ; Add the carry if any
|
||||
adds r12, r12, r8 ; c + pSrc[nWords] + hc
|
||||
|
@ -701,7 +705,7 @@ SymCryptFdefMontgomeryReduceAsmInner
|
|||
add r2, r2, #8 ; Move stored pSrc pointer two words up
|
||||
ldr r0, [sp, #pMod] ; Restore the pMod pointer
|
||||
mov r1, r2 ; Restore the pSrc pointer
|
||||
|
||||
|
||||
bne SymCryptFdefMontgomeryReduceAsmOuter
|
||||
|
||||
;
|
||||
|
|
|
@ -16,7 +16,11 @@
|
|||
#include "symcrypt_name_mangling.inc"
|
||||
#include "symcrypt_magic.inc"
|
||||
|
||||
; As Arm assembler already uses C preprocessor, we can just hardcode this asm to include constants
|
||||
; MASM for now. To be fixed properly when converting arm64 asm to symcryptasm.
|
||||
#define SYMCRYPT_MASM
|
||||
#include "C_asm_shared.inc"
|
||||
#undef SYMCRYPT_MASM
|
||||
|
||||
; A digit consists of 3 words of 64 bits each
|
||||
|
||||
|
@ -213,7 +217,7 @@ SymCryptFdef369RawMulAsmLoopInner1
|
|||
adcs x12, x12, x15 ; Adding the previous word (if there was a carry from the last addition it is added)
|
||||
umulh x15, x6, x8 ; Bits <127:64> of pSrc1[0]*pSrc2[j+2]
|
||||
str x12, [x4], #8 ; Store to destination
|
||||
|
||||
|
||||
cbnz x3, SymCryptFdef369RawMulAsmLoopInner1
|
||||
|
||||
adc x15, x15, XZR ; Store the next word into the destination (with the carry if any)
|
||||
|
|
|
@ -10,7 +10,11 @@
|
|||
#include "symcrypt_name_mangling.inc"
|
||||
#include "symcrypt_magic.inc"
|
||||
|
||||
; As Arm assembler already uses C preprocessor, we can just hardcode this asm to include constants
|
||||
; MASM for now. To be fixed properly when converting arm64 asm to symcryptasm.
|
||||
#define SYMCRYPT_MASM
|
||||
#include "C_asm_shared.inc"
|
||||
#undef SYMCRYPT_MASM
|
||||
|
||||
; A digit consists of 4 words of 64 bits each
|
||||
|
||||
|
|
|
@ -517,11 +517,11 @@ SymCryptFdefIntSetValueUint64(
|
|||
SYMCRYPT_ERROR
|
||||
SYMCRYPT_CALL
|
||||
SymCryptFdefRawSetValue(
|
||||
_In_reads_bytes_(cbSrc) PCBYTE pbSrc,
|
||||
SIZE_T cbSrc,
|
||||
SYMCRYPT_NUMBER_FORMAT format,
|
||||
_Out_writes_(nWords) PUINT32 pDst,
|
||||
UINT32 nDigits )
|
||||
_In_reads_bytes_(cbSrc) PCBYTE pbSrc,
|
||||
SIZE_T cbSrc,
|
||||
SYMCRYPT_NUMBER_FORMAT format,
|
||||
_Out_writes_(nDigits * SYMCRYPT_FDEF_DIGIT_NUINT32) PUINT32 pDst,
|
||||
UINT32 nDigits )
|
||||
{
|
||||
SYMCRYPT_ERROR scError;
|
||||
UINT32 b;
|
||||
|
@ -611,11 +611,11 @@ SymCryptFdefIntSetValue(
|
|||
SYMCRYPT_ERROR
|
||||
SYMCRYPT_CALL
|
||||
SymCryptFdefRawGetValue(
|
||||
_In_reads_(nWords) PCUINT32 pSrc,
|
||||
UINT32 nDigits,
|
||||
_Out_writes_bytes_(cbBytes) PBYTE pbDst,
|
||||
SIZE_T cbDst,
|
||||
SYMCRYPT_NUMBER_FORMAT format )
|
||||
_In_reads_(nDigits * SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc,
|
||||
UINT32 nDigits,
|
||||
_Out_writes_bytes_(cbBytes) PBYTE pbDst,
|
||||
SIZE_T cbDst,
|
||||
SYMCRYPT_NUMBER_FORMAT format )
|
||||
{
|
||||
SYMCRYPT_ERROR scError;
|
||||
UINT32 b;
|
||||
|
|
|
@ -722,11 +722,11 @@ SymCryptFdefIntSquare(
|
|||
VOID
|
||||
SYMCRYPT_CALL
|
||||
SymCryptFdefRawMulC(
|
||||
_In_reads_(nWords1) PCUINT32 pSrc1,
|
||||
UINT32 nDigits1,
|
||||
_In_reads_(nWords2) PCUINT32 pSrc2,
|
||||
UINT32 nDigits2,
|
||||
_Out_writes_(nWords1 + nWords2) PUINT32 pDst )
|
||||
_In_reads_(nDigits1 * SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc1,
|
||||
UINT32 nDigits1,
|
||||
_In_reads_(nDigits2 * SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc2,
|
||||
UINT32 nDigits2,
|
||||
_Out_writes_((nDigits1+nDigits2)*SYMCRYPT_FDEF_DIGIT_NUINT32) PUINT32 pDst )
|
||||
{
|
||||
UINT32 nWords1 = nDigits1 * SYMCRYPT_FDEF_DIGIT_NUINT32;
|
||||
UINT32 nWords2 = nDigits2 * SYMCRYPT_FDEF_DIGIT_NUINT32;
|
||||
|
@ -778,9 +778,9 @@ SymCryptFdefRawMul(
|
|||
VOID
|
||||
SYMCRYPT_CALL
|
||||
SymCryptFdefRawSquareC(
|
||||
_In_reads_(nWords) PCUINT32 pSrc,
|
||||
UINT32 nDigits,
|
||||
_Out_writes_(2*nWords) PUINT32 pDst )
|
||||
_In_reads_(nDigits * SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc,
|
||||
UINT32 nDigits,
|
||||
_Out_writes_(2*nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PUINT32 pDst )
|
||||
{
|
||||
UINT32 nWords = nDigits * SYMCRYPT_FDEF_DIGIT_NUINT32;
|
||||
|
||||
|
|
|
@ -1223,7 +1223,7 @@ SymCryptFdefModMulMontgomery(
|
|||
SymCryptFdefMontgomeryReduce( pmMod, pTmp, &peDst->d.uint32[0] );
|
||||
}
|
||||
|
||||
#if SYMCRYPT_CPU_AMD64 && SYMCRYPT_MS_VC
|
||||
#if SYMCRYPT_CPU_AMD64
|
||||
VOID
|
||||
SYMCRYPT_CALL
|
||||
SymCryptFdefModMulMontgomeryMulx(
|
||||
|
@ -1283,7 +1283,7 @@ SymCryptFdefModSquareMontgomery(
|
|||
}
|
||||
|
||||
|
||||
#if SYMCRYPT_CPU_AMD64 && SYMCRYPT_MS_VC
|
||||
#if SYMCRYPT_CPU_AMD64
|
||||
VOID
|
||||
SYMCRYPT_CALL
|
||||
SymCryptFdefModSquareMontgomeryMulx(
|
||||
|
@ -1356,70 +1356,12 @@ SymCryptFdefModInvMontgomery(
|
|||
return scError;
|
||||
}
|
||||
|
||||
#if SYMCRYPT_CPU_AMD64 && SYMCRYPT_MS_VC
|
||||
#if SYMCRYPT_CPU_AMD64
|
||||
|
||||
//=====================================
|
||||
// 256-bit Montgomery modulus code
|
||||
//
|
||||
|
||||
VOID
|
||||
SYMCRYPT_CALL
|
||||
SymCryptFdefModAdd256Test(
|
||||
_In_ PCSYMCRYPT_MODULUS pmMod,
|
||||
_In_ PCSYMCRYPT_MODELEMENT peSrc1,
|
||||
_In_ PCSYMCRYPT_MODELEMENT peSrc2,
|
||||
_Out_ PSYMCRYPT_MODELEMENT peDst,
|
||||
_Out_writes_bytes_( cbScratch ) PBYTE pbScratch,
|
||||
SIZE_T cbScratch )
|
||||
{
|
||||
SYMCRYPT_ASYM_ALIGN BYTE buf1[128];
|
||||
SYMCRYPT_ASYM_ALIGN BYTE buf2[128];
|
||||
PSYMCRYPT_MODELEMENT peTmp1 = SymCryptModElementCreate( SYMCRYPT_ASYM_ALIGN_UP( buf1 ), sizeof( buf1 ) - SYMCRYPT_ASYM_ALIGN_VALUE, pmMod );
|
||||
PSYMCRYPT_MODELEMENT peTmp2 = SymCryptModElementCreate( SYMCRYPT_ASYM_ALIGN_UP( buf2 ), sizeof( buf2 ) - SYMCRYPT_ASYM_ALIGN_VALUE, pmMod );
|
||||
|
||||
(VOID) peTmp1;
|
||||
(VOID) peTmp2;
|
||||
|
||||
SymCryptFdefModAdd256Asm( pmMod, peSrc1, peSrc2, peTmp1, pbScratch, cbScratch );
|
||||
SymCryptFdefModAddGeneric( pmMod, peSrc1, peSrc2, peTmp2, pbScratch, cbScratch );
|
||||
|
||||
if( memcmp( peTmp1, peTmp2, 64 ) != 0 )
|
||||
{
|
||||
SymCryptFatal( 42 );
|
||||
}
|
||||
|
||||
SymCryptFdefModAdd256Asm( pmMod, peSrc1, peSrc2, peDst, pbScratch, cbScratch );
|
||||
}
|
||||
|
||||
VOID
|
||||
SYMCRYPT_CALL
|
||||
SymCryptFdefModMulMontgomery256Test(
|
||||
_In_ PCSYMCRYPT_MODULUS pmMod,
|
||||
_In_ PCSYMCRYPT_MODELEMENT peSrc1,
|
||||
_In_ PCSYMCRYPT_MODELEMENT peSrc2,
|
||||
_Out_ PSYMCRYPT_MODELEMENT peDst,
|
||||
_Out_writes_bytes_( cbScratch ) PBYTE pbScratch,
|
||||
SIZE_T cbScratch )
|
||||
{
|
||||
SYMCRYPT_ASYM_ALIGN BYTE buf1[128];
|
||||
SYMCRYPT_ASYM_ALIGN BYTE buf2[128];
|
||||
PSYMCRYPT_MODELEMENT peTmp1 = SymCryptModElementCreate( SYMCRYPT_ASYM_ALIGN_UP( buf1 ), sizeof( buf1 ) - SYMCRYPT_ASYM_ALIGN_VALUE, pmMod );
|
||||
PSYMCRYPT_MODELEMENT peTmp2 = SymCryptModElementCreate( SYMCRYPT_ASYM_ALIGN_UP( buf2 ), sizeof( buf2 ) - SYMCRYPT_ASYM_ALIGN_VALUE, pmMod );
|
||||
|
||||
(VOID) peTmp1;
|
||||
(VOID) peTmp2;
|
||||
|
||||
SymCryptFdefModMulMontgomery256Asm( pmMod, peSrc1, peSrc2, peTmp1, pbScratch, cbScratch );
|
||||
//SymCryptFdefModMulMontgomery( pmMod, peSrc1, peSrc2, peTmp2, pbScratch, cbScratch ); *** This doesn't produce the same result as it reduces a whole digit, not 256 bits
|
||||
|
||||
if( memcmp( peTmp1, peTmp2, 64 ) != 0 )
|
||||
{
|
||||
// SymCryptFatal( 42 );
|
||||
}
|
||||
|
||||
SymCryptFdefModMulMontgomery256Asm( pmMod, peSrc1, peSrc2, peDst, pbScratch, cbScratch );
|
||||
}
|
||||
|
||||
VOID
|
||||
SYMCRYPT_CALL
|
||||
SymCryptFdefModSquareMontgomery256(
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
;
|
||||
; fdef_asm.asm Assembler code for fast arithmetic
|
||||
; fdef_asm.cppasm Assembler code for fast arithmetic
|
||||
; Requires C preprocessor to correctly include C_asm_shared.inc
|
||||
;
|
||||
; Copyright (c) Microsoft Corporation. Licensed under the MIT license.
|
||||
;
|
||||
|
@ -11,9 +12,9 @@
|
|||
;
|
||||
; FPO documentation:
|
||||
; The .FPO provides debugging information.
|
||||
; This stuff not well documented,
|
||||
; This stuff not well documented,
|
||||
; but here is the information I've gathered about the arguments to .FPO
|
||||
;
|
||||
;
|
||||
; In order:
|
||||
; cdwLocals: Size of local variables, in DWords
|
||||
; cdwParams: Size of parameters, in DWords. Given that this is all about
|
||||
|
@ -23,7 +24,7 @@
|
|||
; prolog code with work for better performance. Most uses of
|
||||
; .FPO seem to set this value to 0 anyway, which is what we
|
||||
; will do.
|
||||
; cbRegs : # registers saved in the prolog.
|
||||
; cbRegs : # registers saved in the prolog.
|
||||
; fUseBP : 0 if EBP is not used as base pointer, 1 if EBP is used as base pointer
|
||||
; cbFrame : Type of frame.
|
||||
; 0 = FPO frame (no frame pointer)
|
||||
|
@ -43,7 +44,7 @@ _TEXT SEGMENT PARA PUBLIC USE32 'CODE'
|
|||
include symcrypt_version.inc
|
||||
include symcrypt_magic.inc
|
||||
|
||||
include C_asm_shared.inc
|
||||
#include "C_asm_shared.inc"
|
||||
|
||||
PUBLIC @SymCryptFdefRawAddAsm@16
|
||||
PUBLIC @SymCryptFdefRawSubAsm@16
|
||||
|
@ -60,7 +61,7 @@ BEFORE_PROC MACRO
|
|||
;
|
||||
DB 5 dup (0cch)
|
||||
ENDM
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
@ -86,7 +87,7 @@ pDst dd ?
|
|||
nDigits dd ?
|
||||
|
||||
SymCryptFdefRawAddAsmFrame ends
|
||||
|
||||
|
||||
; ecx = pSrc1
|
||||
; edx = pSrc2
|
||||
|
||||
|
@ -129,7 +130,7 @@ SymCryptFdefRawAddAsmLoop:
|
|||
pop edi
|
||||
pop ebx
|
||||
ret 8
|
||||
|
||||
|
||||
@SymCryptFdefRawAddAsm@16 ENDP
|
||||
|
||||
|
||||
|
@ -154,7 +155,7 @@ pDst dd ?
|
|||
nDigits dd ?
|
||||
|
||||
SymCryptFdefRawSubAsmFrame ends
|
||||
|
||||
|
||||
; ecx = pSrc1
|
||||
; edx = pSrc2
|
||||
|
||||
|
@ -197,7 +198,7 @@ SymCryptFdefRawSubAsmLoop:
|
|||
pop edi
|
||||
pop ebx
|
||||
ret 8
|
||||
|
||||
|
||||
@SymCryptFdefRawSubAsm@16 ENDP
|
||||
|
||||
|
||||
|
@ -305,8 +306,8 @@ SymCryptFdefRawMulAsmFrame ends
|
|||
; for each word in Src1:
|
||||
; Dst += Src2 * word
|
||||
; Register assignments
|
||||
;
|
||||
; eax = tmp/lower half of mult
|
||||
;
|
||||
; eax = tmp/lower half of mult
|
||||
; ebx = multiplicant
|
||||
; ecx = loop counter, initialized to nDigits2
|
||||
; edx = upper half of mult
|
||||
|
@ -315,7 +316,7 @@ SymCryptFdefRawMulAsmFrame ends
|
|||
; ebp = carry
|
||||
;
|
||||
; esp + pSrc1 running pointer into Src1
|
||||
; esp +
|
||||
; esp +
|
||||
|
||||
|
||||
mov edi,edi
|
||||
|
@ -436,7 +437,7 @@ SymCryptFdefRawMulAsmLoop2:
|
|||
adc edx, 0
|
||||
mov [edi + 12], eax
|
||||
mov ebp, edx
|
||||
|
||||
|
||||
add esi, 16
|
||||
add edi, 16
|
||||
sub ecx,1
|
||||
|
@ -477,7 +478,7 @@ SymCryptFdefMontgomeryReduceAsmFrame struct 4, NONUNIQUE
|
|||
HighCarry dd ?
|
||||
pSrc dd ?
|
||||
pModValue dd ?
|
||||
nWords dd ?
|
||||
nWords dd ?
|
||||
SaveEbp dd ? ; # words still to process in outer loop
|
||||
SaveEsi dd ?
|
||||
SaveEdi dd ?
|
||||
|
@ -513,13 +514,13 @@ SymCryptFdefMontgomeryReduceAsmFrame ends
|
|||
SymCryptFdefMontgomeryReduceOuterLoop:
|
||||
; eax = <undef>
|
||||
; ebx = <undef>
|
||||
; ecx = <undef>
|
||||
; ecx = <undef>
|
||||
; edx = <undef>
|
||||
; esi = start of mod value
|
||||
; edi = pSrc + 4 * loop iteration count
|
||||
; ebp = <undef>
|
||||
|
||||
; compute multiplier for this outer loop iteration.
|
||||
; compute multiplier for this outer loop iteration.
|
||||
mov ebx, [esi - SymCryptModulusValueOffsetX86 + SymCryptModulusMontgomeryInv64OffsetX86 ]
|
||||
imul ebx, [edi] ; word we want to zero out, ebx = multiplier for this inner loop
|
||||
|
||||
|
@ -529,7 +530,7 @@ SymCryptFdefMontgomeryReduceOuterLoop:
|
|||
SymCryptFdefMontgomeryReduceInnerLoop:
|
||||
; eax = mul scratch
|
||||
; ebx = multiplier
|
||||
; ecx = digit counter
|
||||
; ecx = digit counter
|
||||
; edx = mul scratch
|
||||
; esi = running pointer to mod value
|
||||
; edi = running pointer to input/scratch
|
||||
|
@ -570,7 +571,7 @@ SymCryptFdefMontgomeryReduceInnerLoop:
|
|||
adc edx, 0
|
||||
mov [edi + 12], eax
|
||||
mov ebp, edx
|
||||
|
||||
|
||||
add esi, 16
|
||||
add edi, 16
|
||||
sub ecx,1
|
||||
|
@ -606,7 +607,7 @@ SymCryptFdefMontgomeryReduceInnerLoop:
|
|||
|
||||
mov ecx, [esi - SymCryptModulusValueOffsetX86 + SymCryptModulusNdigitsOffsetX86] ; loop counter
|
||||
mov edx, [esp + SymCryptFdefMontgomeryReduceAsmFrame.pDst];
|
||||
|
||||
|
||||
; ecx = nDigits
|
||||
|
||||
; Save some values for the copy loop
|
|
@ -1,314 +0,0 @@
|
|||
;
|
||||
; rc4asm.asm
|
||||
;
|
||||
; Copyright (c) Microsoft Corporation. Licensed under the MIT license.
|
||||
;
|
||||
; RC4 implementation in x86 assembler
|
||||
; This is a new RC4 implementation for SymCrypt.
|
||||
; It is NOT based on the existing one in RSA32.lib.
|
||||
;
|
||||
|
||||
|
||||
TITLE "RC4"
|
||||
.586P
|
||||
|
||||
_TEXT SEGMENT PARA PUBLIC USE32 'CODE'
|
||||
ASSUME CS:_TEXT, DS:FLAT, SS:FLAT
|
||||
|
||||
include symcrypt_version.inc
|
||||
include symcrypt_magic.inc
|
||||
|
||||
;
|
||||
; Structure definition that mirrors the SYMCRYPT_RC4_STATE struct
|
||||
;
|
||||
|
||||
RC4_STATE struct
|
||||
S db 256 dup (?)
|
||||
i db ?
|
||||
j db ?
|
||||
|
||||
SYMCRYPT_MAGIC_FIELD
|
||||
|
||||
RC4_STATE ends
|
||||
|
||||
|
||||
PUBLIC @SymCryptRc4InitAsm@12
|
||||
PUBLIC @SymCryptRc4CryptAsm@16
|
||||
|
||||
|
||||
BEFORE_PROC MACRO
|
||||
;
|
||||
; Our current x86 compiler inserts 5 0xcc bytes before every function
|
||||
; and starts every function with a 2-byte NOP.
|
||||
; This supports hot-patching.
|
||||
;
|
||||
DB 5 dup (0cch)
|
||||
ENDM
|
||||
|
||||
|
||||
; The .FPO provides debugging information.
|
||||
; This stuff not well documented,
|
||||
; but here is the information I've gathered about the arguments to .FPO
|
||||
;
|
||||
; In order:
|
||||
; cdwLocals: Size of local variables, in DWords
|
||||
; cdwParams: Size of parameters, in DWords. Given that this is all about
|
||||
; stack stuff, I'm assuming this is only about parameters passed
|
||||
; on the stack.
|
||||
; cbProlog : Number of bytes in the prolog code. We have interleaved the
|
||||
; prolog code with work for better performance. Most uses of
|
||||
; .FPO seem to set this value to 0 anyway, which is what we
|
||||
; will do.
|
||||
; cbRegs : # registers saved in the prolog. 4 in our case
|
||||
; fUseBP : 0 if EBP is not used as base pointer, 1 if EBP is used as base pointer
|
||||
; cbFrame : Type of frame.
|
||||
; 0 = FPO frame (no frame pointer)
|
||||
; 1 = Trap frame (result of a CPU trap event)
|
||||
; 2 = TSS frame
|
||||
;
|
||||
; Having looked at various occurrences of .FPO in the Windows code it
|
||||
; seems to be used fairly sloppy, with lots of arguments left 0 even when
|
||||
; they probably shouldn't be according to the spec.
|
||||
;
|
||||
|
||||
|
||||
|
||||
BEFORE_PROC
|
||||
|
||||
@SymCryptRc4InitAsm@12 PROC
|
||||
;VOID
|
||||
;SYMCRYPT_CALL
|
||||
;SymCryptRc4InitAsm(
|
||||
; _Out_ PSYMCRYPT_RC4_STATE pState,
|
||||
; _In_reads_bytes_( cbKey ) PCBYTE pbKey,
|
||||
; _In_ SIZE_T cbKey );
|
||||
;
|
||||
; NOTE: Unlike the SymCryptRc4Init function
|
||||
; this function does not check the cbKey validity, and does not return an error code.
|
||||
; Currently we don't have the error code values symbolically in the asm environment.
|
||||
; We use an inlined function to generate the errors instead, and call this function
|
||||
; only when there are no errors.
|
||||
;
|
||||
|
||||
Rc4InitFrame struct 4, NONUNIQUE
|
||||
|
||||
pbKey dd ?
|
||||
SaveEdi dd ?
|
||||
SaveEsi dd ?
|
||||
SaveEbp dd ?
|
||||
SaveEbx dd ?
|
||||
ReturnAddress dd ?
|
||||
cbKey dd ?
|
||||
|
||||
Rc4InitFrame ends
|
||||
|
||||
.FPO(5,1,0,4,0,0)
|
||||
|
||||
; ecx = pState
|
||||
; edx = pKey
|
||||
; [esp + 4] = cbKey
|
||||
|
||||
;
|
||||
; Set up stack frame, and initialize pbKey
|
||||
;
|
||||
mov edi,edi ; 2-byte NOP for hot-patching
|
||||
|
||||
push ebx
|
||||
push ebp
|
||||
push esi
|
||||
push edi
|
||||
push edx
|
||||
|
||||
;
|
||||
; Initialize S[i] = i
|
||||
;
|
||||
lea esi,[ecx + 100h]
|
||||
mov edi,ecx
|
||||
|
||||
mov eax,03020100h
|
||||
mov ebx,04040404h
|
||||
|
||||
@@:
|
||||
mov [edi],eax
|
||||
add eax,ebx
|
||||
mov [edi+4],eax
|
||||
add eax,ebx
|
||||
mov [edi+8],eax
|
||||
add eax,ebx
|
||||
mov [edi+12],eax
|
||||
add eax,ebx
|
||||
add edi,16
|
||||
cmp edi,esi
|
||||
jb @B
|
||||
|
||||
|
||||
mov ebp,edx
|
||||
xor ebx,ebx ; j = 0
|
||||
xor esi,esi ; i = 0
|
||||
mov edi,[esp + Rc4InitFrame.cbKey]
|
||||
add edi, edx ; edi = pbKey + cbKey
|
||||
|
||||
SymCryptRc4InitMainLoop:
|
||||
; Registers:
|
||||
; eax = Tmp1
|
||||
; ebx = j
|
||||
; ecx = S
|
||||
; edx = Tmp2
|
||||
; esi = i
|
||||
; edi = keyLimit ; just beyond the key
|
||||
; ebp = pKey ; pointer to current key byte
|
||||
|
||||
movzx edx,byte ptr[ebp] ; get key byte
|
||||
add ebx,edx ; j += key byte
|
||||
movzx eax,byte ptr[ecx + esi] ; get S[i]
|
||||
add ebx,eax ; j += S[i]
|
||||
|
||||
and ebx,0ffh
|
||||
|
||||
movzx edx,byte ptr [ecx + ebx]; get S[j]
|
||||
mov byte ptr[ecx + ebx], al ; update S[j]
|
||||
mov byte ptr[ecx + esi], dl ; update S[i]
|
||||
|
||||
add ebp,1 ; increment key pointer modulo key length
|
||||
cmp ebp,edi
|
||||
jb @F
|
||||
mov ebp,[esp + Rc4InitFrame.pbKey]
|
||||
@@:
|
||||
|
||||
add esi,1 ; increment i
|
||||
cmp esi,100h
|
||||
jb SymCryptRc4InitMainLoop
|
||||
|
||||
mov word ptr [ecx + RC4_STATE.i], 1 ; i = 1; j = 0
|
||||
|
||||
add esp,4
|
||||
pop edi
|
||||
pop esi
|
||||
pop ebp
|
||||
pop ebx
|
||||
ret 4
|
||||
|
||||
|
||||
@SymCryptRc4InitAsm@12 ENDP
|
||||
|
||||
|
||||
|
||||
|
||||
BEFORE_PROC
|
||||
|
||||
@SymCryptRc4CryptAsm@16 PROC
|
||||
;VOID
|
||||
;SYMCRYPT_CALL
|
||||
;SymCryptRc4Crypt(
|
||||
; _Inout_ PSYMCRYPT_RC4_STATE pState,
|
||||
; _In_reads_bytes_( cbData ) PCBYTE pbSrc,
|
||||
; _Out_writes_bytes_( cbData ) PBYTE pbDst,
|
||||
; _In_ SIZE_T cbData )
|
||||
|
||||
Rc4CryptFrame struct 4, NONUNIQUE
|
||||
pbEndDst dd ?
|
||||
SaveEdi dd ?
|
||||
SaveEsi dd ?
|
||||
SaveEbp dd ?
|
||||
SaveEbx dd ?
|
||||
ReturnAddress dd ?
|
||||
pbDst dd ?
|
||||
cbData dd ?
|
||||
|
||||
Rc4CryptFrame ends
|
||||
|
||||
.FPO(5,2,0,4,0,0)
|
||||
|
||||
|
||||
mov edi,edi
|
||||
|
||||
push ebx
|
||||
push ebp
|
||||
push esi
|
||||
push edi
|
||||
sub esp,4
|
||||
|
||||
SYMCRYPT_CHECK_MAGIC ecx, RC4_STATE
|
||||
|
||||
mov eax,[esp + Rc4CryptFrame.cbData]
|
||||
test eax,eax
|
||||
jz Rc4CryptDoNothing
|
||||
|
||||
mov ebp,[esp + Rc4CryptFrame.pbDst]
|
||||
add eax,ebp
|
||||
mov [esp + Rc4CryptFrame.pbEndDst], eax
|
||||
|
||||
mov edi, edx
|
||||
movzx edx,[ecx + RC4_STATE.i]
|
||||
movzx esi,[ecx + RC4_STATE.j]
|
||||
|
||||
;
|
||||
; Further perf improvements are possible.
|
||||
; Instead of encrypting byte-by-byte, we can collect 4 bytes of the key
|
||||
; stream in a register, and then encrypt 4 bytes at a time.
|
||||
; This reduces the # memory operations we do per byte.
|
||||
; Ideally this is done with aligned operations, either
|
||||
; aligning to pbSrc, pbDst, or to i (which removes the need to increment i every time).
|
||||
;
|
||||
|
||||
@@:
|
||||
; eax Ti
|
||||
; ebx Tj
|
||||
; ecx S
|
||||
; edx i
|
||||
; esi j
|
||||
; edi pSrc
|
||||
; ebp pDst
|
||||
|
||||
movzx eax, byte ptr[ecx + edx] ; Ti = S[i]
|
||||
|
||||
;add esi, eax
|
||||
;and esi, 0ffh
|
||||
lea ebx, [esi + eax]
|
||||
movzx esi, bl ; j += Ti
|
||||
|
||||
movzx ebx, byte ptr[ecx + esi] ; Tj = S[j]
|
||||
mov [ecx + edx], bl ; S[i] = Tj
|
||||
mov [ecx + esi], al ; S[j] = Ti
|
||||
|
||||
;add eax,ebx
|
||||
;and eax,0ffh
|
||||
lea eax,[eax + ebx]
|
||||
movzx eax,al ; Ti = Ti + Tj
|
||||
|
||||
mov al,[ecx + eax] ; Til = S[Ti]
|
||||
|
||||
;add edx, 1
|
||||
;and 0ffh
|
||||
lea edx,[edx + 1]
|
||||
movzx edx,dl ; i += 1
|
||||
|
||||
xor al,[edi]
|
||||
add edi,1
|
||||
mov [ebp],al
|
||||
add ebp, 1
|
||||
|
||||
cmp ebp,[esp + Rc4CryptFrame.pbEndDst]
|
||||
jb @B
|
||||
|
||||
mov eax, esi
|
||||
mov [ecx + RC4_STATE.i], dl
|
||||
mov [ecx + RC4_STATE.j], al
|
||||
|
||||
Rc4CryptDoNothing:
|
||||
|
||||
add esp,4
|
||||
pop edi
|
||||
pop esi
|
||||
pop ebp
|
||||
pop ebx
|
||||
ret 8
|
||||
|
||||
|
||||
@SymCryptRc4CryptAsm@16 ENDP
|
||||
|
||||
|
||||
|
||||
_TEXT ENDS
|
||||
|
||||
END
|
|
@ -1,383 +0,0 @@
|
|||
;
|
||||
; Sha1Asm.Asm
|
||||
;
|
||||
; Copyright (c) Microsoft Corporation. Licensed under the MIT license.
|
||||
;
|
||||
;
|
||||
|
||||
;
|
||||
; This module implements the bulk processing of the FIPS 180-1 SHA message digest algorithm.
|
||||
; for the x86 processor architecture.
|
||||
;
|
||||
; This implementation is derived from an older one by Scott Field and
|
||||
; Dan Shumow.
|
||||
;
|
||||
; This implementation is optimized for Intel Core and contemporary AMD CPUs.
|
||||
; Optimizations for pre-P3 Intel CPUs has been removed.
|
||||
;
|
||||
|
||||
|
||||
TITLE sha1asm.asm
|
||||
.486
|
||||
|
||||
_TEXT SEGMENT PARA PUBLIC USE32 'CODE'
|
||||
ASSUME CS:_TEXT, DS:FLAT, SS:FLAT
|
||||
|
||||
PUBLIC @SymCryptSha1AppendBlocksAsm@12
|
||||
|
||||
;
|
||||
; The four round constants used by SHA-1
|
||||
;
|
||||
|
||||
K0_19 EQU 05a827999H
|
||||
K20_39 EQU 06ed9eba1H
|
||||
K40_59 EQU 08f1bbcdcH
|
||||
K60_79 EQU 0ca62c1d6H
|
||||
|
||||
align 16
|
||||
|
||||
;VOID
|
||||
;SYMCRYPT_CALL
|
||||
;SymCryptSha1AppendBlocks( _Inout_updates_( 5 ) PUINT32 H,
|
||||
; _In_reads_bytes_( cbData ) PCBYTE pbData,
|
||||
; SIZE_T cbData )
|
||||
;
|
||||
@SymCryptSha1AppendBlocksAsm@12 PROC
|
||||
|
||||
;
|
||||
; To keep stack manipulatins simple we define a structure and use that for all accesses.
|
||||
;
|
||||
SymCryptSha1AppendBlocksFrame struct 4, NONUNIQUE
|
||||
|
||||
Wbuf dd 16 dup (?)
|
||||
Hptr dd ?
|
||||
pbData dd ?
|
||||
BlockCount dd ?
|
||||
SaveEdi dd ?
|
||||
SaveEsi dd ?
|
||||
SaveEbp dd ?
|
||||
SaveEbx dd ?
|
||||
ReturnAddress dd ?
|
||||
CbData dd ?
|
||||
|
||||
SymCryptSha1AppendBlocksFrame ends
|
||||
|
||||
;
|
||||
; We use the W buffer extensively; this is a shorthand for the base address
|
||||
;
|
||||
W equ esp+SymCryptSha1AppendBlocksFrame.Wbuf
|
||||
|
||||
;
|
||||
; The .FPO provides debugging information for stack frames that do not use
|
||||
; ebp as a base pointer.
|
||||
; This stuff not well documented,
|
||||
; but here is the information I've gathered about the arguments to .FPO
|
||||
;
|
||||
; In order:
|
||||
; cdwLocals: Size of local variables, in DWords
|
||||
; cdwParams: Size of parameters, in DWords. Given that this is all about
|
||||
; stack stuff, I'm assuming this is only about parameters passed
|
||||
; on the stack.
|
||||
; cbProlog : Number of bytes in the prolog code. We sometimes interleaved the
|
||||
; prolog code with work for better performance. Most uses of
|
||||
; .FPO seem to set this value to 0.
|
||||
; The debugger seems to work if the prolog defined by this value
|
||||
; contains all the stack adjustments.
|
||||
; cbRegs : # registers saved in the prolog. 4 in our case
|
||||
; fUseBP : 0 if EBP is not used as base pointer, 1 if EBP is used as base pointer
|
||||
; cbFrame : Type of frame.
|
||||
; 0 = FPO frame (no frame pointer)
|
||||
; 1 = Trap frame (result of a CPU trap event)
|
||||
; 2 = TSS frame
|
||||
;
|
||||
; Having looked at various occurrences of .FPO in the Windows code it
|
||||
; seems to be used fairly sloppy, with lots of arguments left 0 even when
|
||||
; they probably shouldn't be according to the spec.
|
||||
;
|
||||
.FPO(23,1,3,4,0,0) ; 3 byte prolog (covers esp ajustment only)
|
||||
|
||||
; At this point:
|
||||
; ecx = H
|
||||
; edx = pbData
|
||||
; [esp+4] = cbData
|
||||
|
||||
;
|
||||
; Set up our stack frame and save non-volatile registers
|
||||
;
|
||||
sub esp,SymCryptSha1AppendBlocksFrame.ReturnAddress
|
||||
mov [esp+SymCryptSha1AppendBlocksFrame.SaveEbp],ebp
|
||||
mov [esp+SymCryptSha1AppendBlocksFrame.SaveEdi],edi
|
||||
mov [esp+SymCryptSha1AppendBlocksFrame.SaveEsi],esi
|
||||
mov [esp+SymCryptSha1AppendBlocksFrame.SaveEbx],ebx
|
||||
|
||||
mov [esp+SymCryptSha1AppendBlocksFrame.Hptr], ecx
|
||||
|
||||
;
|
||||
; To allow macro re-ordering of our registers we use symbolic names
|
||||
; for the registers.
|
||||
; r0-r4 are the 5 state registers. x1 and x2 are extra scratch registers.
|
||||
; Note: some prolog code puts the right value in the right register and
|
||||
; has to be updated if this mapping is changed.
|
||||
;
|
||||
r0 EQU eax
|
||||
r1 EQU ebx
|
||||
r2 EQU ecx
|
||||
r3 EQU edx
|
||||
r4 EQU esi
|
||||
x1 EQU ebp
|
||||
x2 EQU edi
|
||||
|
||||
;
|
||||
; compute how many blocks we will process.
|
||||
; This code ensures that we never exceed the data buffer we were given,
|
||||
; although we silently round the cbData parameter down to the next
|
||||
; multiple of 64.
|
||||
; Do nothing if no blocks need to be processed.
|
||||
;
|
||||
mov eax,[esp+SymCryptSha1AppendBlocksFrame.CbData]
|
||||
shr eax,6
|
||||
jz SymCryptSha1AppendBlocksDone
|
||||
mov [esp+SymCryptSha1AppendBlocksFrame.BlockCount], eax
|
||||
|
||||
;
|
||||
; The data pointer goes into x1 = ebp at the start of our loop
|
||||
;
|
||||
mov ebp,edx
|
||||
|
||||
;
|
||||
; Load the H state from [ecx], making sure we load the r2=ecx register
|
||||
; last.
|
||||
;
|
||||
mov r0,[ecx ]
|
||||
mov r4,[ecx+ 4]
|
||||
mov r3,[ecx+ 8]
|
||||
mov r1,[ecx+16]
|
||||
mov r2,[ecx+12]
|
||||
|
||||
|
||||
SymCryptSha1AppendBlocksLoop:
|
||||
;
|
||||
; This is the main loop. We process 64 bytes in each iteration.
|
||||
; invariant: ebp = pbData
|
||||
;
|
||||
|
||||
;
|
||||
; Most of the code in the loop is generated through macros using parameters to
|
||||
; rename the registers.
|
||||
; The macros get the register number passed as parameter. They use
|
||||
; "r&<param>" to paste the number and the 'r' together and get the register
|
||||
; name we defined above.
|
||||
;
|
||||
|
||||
ROUND_CH_0_15 MACRO round,ra,rb,rc,rd,re,x1,x2
|
||||
;
|
||||
; Code for round 0-15.
|
||||
; This code loads data from the data buffer & BSWAPs the data to get it into the
|
||||
; right form.
|
||||
;
|
||||
; Parameters:
|
||||
; round round number
|
||||
; ra register number that contains the a value
|
||||
; rb register number that contains the b value
|
||||
; rc register number that contains the c value
|
||||
; rd register number that contains the d value
|
||||
; re register number that contains the e value
|
||||
; x1 pointer to the input data
|
||||
; x2 scratch register.
|
||||
;
|
||||
; We use the formula CH(b,c,d) = ((d ^ c) & b) ^ c which uses only one temp register.
|
||||
; We start with the d value as that is the oldest value and available the first
|
||||
;
|
||||
; See FIPS 180-2 for our symbolic notation.
|
||||
;
|
||||
mov x2,[x1+4*round] ; Fetch word from message
|
||||
bswap x2 ; x2 = Wt
|
||||
add r&re,x2 ; re = e + Wt
|
||||
mov [W + 4*round],x2 ; Store in W buffer for future use
|
||||
|
||||
mov x2,r&ra ; x2 = a
|
||||
rol x2,5 ; x2 = ROL(a,5)
|
||||
add r&re,x2 ; re = e + Wt + ROL(a,5)
|
||||
|
||||
mov x2,r&rd ; x2 = d
|
||||
xor x2,r&rc ; x2 = (d ^ c)
|
||||
and x2,r&rb ; x2 = ((d ^ c) & b)
|
||||
ror r&rb,2 ; rb = ROL( b, 30 )
|
||||
xor x2,r&rd ; x2 = ((d ^ c) & b) ^ d = CH(b,c,d)
|
||||
lea r&re,[r&re+x2+K0_19] ; re = e + Wt + ROL(a,5) + Ch(b,c,d) + Kt
|
||||
|
||||
ENDM
|
||||
|
||||
ROUND_CH MACRO round, ra, rb, rc, rd, re, x1, x2
|
||||
;
|
||||
; See ROUND_CH_0_15 for most parameters.
|
||||
; x1 and x2 are both scratch registers
|
||||
;
|
||||
mov x2,[W+4*((round-16) MOD 16)] ; x2 = W[t-16]
|
||||
mov x1,r&ra ; x1 = a
|
||||
rol x1,5 ; x1 = ROL(a,5)
|
||||
xor x2,[W+4*((round-14) MOD 16)] ; x2 = W[t-16] ^ W[t-14]
|
||||
add r&re,x1 ; re = e + ROL(a,5)
|
||||
mov x1,r&rd ; x1 = d
|
||||
xor x2,[W+4*((round- 8) MOD 16)] ; x2 = W[t-16] ^ W[t-14] ^ W[t-8]
|
||||
xor x1,r&rc ; x1 = (d ^ c)
|
||||
and x1,r&rb ; x1 = ((d ^ c) & b)
|
||||
xor x2,[W+4*((round- 3) MOD 16)] ; x2 = W[t-16] ^ W[t-14] ^ W[t-8] ^ W[t-3]
|
||||
xor x1,r&rd ; x1 = ((d ^ c) & b) ^ d = CH(b,c,d)
|
||||
rol x2,1 ; x2 = Wt
|
||||
mov [W+4*((round-16) MOD 16)],x2 ;
|
||||
add r&re,x2 ; re = e + ROL(a,5) + Wt
|
||||
ror r&rb,2 ; rb = ROL( b, 30 )
|
||||
lea r&re,[r&re+x1+K0_19] ; re = e + Wt + ROL(a,5) + Ch(b,c,d) + Kt
|
||||
ENDM
|
||||
|
||||
ROUND_PARITY MACRO round, ra, rb, rc, rd, re, x1, x2, K, store
|
||||
;
|
||||
; See ROUND_CH for most parameters
|
||||
; K is the round constant to use.
|
||||
; store is 1 if the Wt value should be stored, 0 otherwise
|
||||
; (used to avoid stores in the last few rounds)
|
||||
;
|
||||
; The order of xorring the registers b, c, and d is driven by the data dependency graph.
|
||||
; We start with d (the oldest) and then do b to unblock the subsequent rotate
|
||||
;
|
||||
mov x2,[W+4*((round-16) MOD 16)] ; x2 = W[t-16]
|
||||
mov x1,r&ra ; x1 = a
|
||||
rol x1,5 ; x1 = ROL(a,5)
|
||||
xor x2,[W+4*((round-14) MOD 16)] ; x2 = W[t-16] ^ W[t-14]
|
||||
add r&re,x1 ; re = e + ROL(a,5)
|
||||
mov x1,r&rd ; x1 = d
|
||||
xor x2,[W+4*((round- 8) MOD 16)] ; x2 = W[t-16] ^ W[t-14] ^ W[t-8]
|
||||
xor x1,r&rb ; x1 = (d ^ b)
|
||||
xor x1,r&rc ; x1 = (d ^ b ^ c) = Parity(b,c,d)
|
||||
xor x2,[W+4*((round- 3) MOD 16)] ; x2 = W[t-16] ^ W[t-14] ^ W[t-8] ^ W[t-3]
|
||||
rol x2,1 ; x2 = Wt
|
||||
add r&re,x1 ; re = e + ROL(a,5) + Parity(b,c,d)
|
||||
IF store
|
||||
mov [W+4*((round-16) MOD 16)],x2 ;
|
||||
ENDIF
|
||||
ror r&rb,2 ; rb = ROL( b, 30 )
|
||||
lea r&re,[r&re+x2+K] ; re = e + ROL(a,5) + Parity(b,c,d) + Wt + Kt
|
||||
|
||||
ENDM
|
||||
|
||||
ROUND_MAJ MACRO round, ra, rb, rc, rd, re, x1, x2
|
||||
;
|
||||
; See above for parameter explanation
|
||||
;
|
||||
mov x2,[W+4*((round-16) MOD 16)] ; x2 = W[t-16]
|
||||
mov x1,r&ra ; x1 = a
|
||||
rol x1,5 ; x1 = ROL(a,5)
|
||||
xor x2,[W+4*((round-14) MOD 16)] ; x2 = W[t-16] ^ W[t-14]
|
||||
add r&re,x1 ; re = e + ROL(a,5)
|
||||
mov x1,r&rd ; x1 = d
|
||||
xor x2,[W+4*((round- 8) MOD 16)] ; x2 = W[t-16] ^ W[t-14] ^ W[t-8]
|
||||
or x1,r&rc ; x1 = (d | c)
|
||||
and x1,r&rb ; x1 = ((d | c) & b)
|
||||
xor x2,[W+4*((round- 3) MOD 16)] ; x2 = W[t-16] ^ W[t-14] ^ W[t-8] ^ W[t-3] = Wt
|
||||
rol x2,1 ; x2 = Wt
|
||||
add r&re,x2 ; re = e + ROL(a,5) + Wt
|
||||
mov [W+4*((round-16) MOD 16)],x2 ;
|
||||
|
||||
mov x2,r&rc ; x2 = c
|
||||
and x2,r&rd ; x2 = (c & d)
|
||||
or x1,x2 ; x1 = ((d | c) & b) | (d & c) = MAJ(b,c,d)
|
||||
|
||||
ror r&rb,2 ; rb = ROL( b, 30 )
|
||||
|
||||
lea r&re,[r&re+x1+K40_59] ; re = e + ROL(a,5) + Wt + Maj(b,c,d) + Kt
|
||||
ENDM
|
||||
|
||||
;
|
||||
; With these macros we can now produce the actual code.
|
||||
; Note the use of the % operator which evaluates the expression and yields the result as text.
|
||||
; Together with the macros and the r<i> EQUs this provides us with automatic register renaming
|
||||
; for each round.
|
||||
;
|
||||
FOR t, <0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15>
|
||||
ROUND_CH_0_15 t, %(t MOD 5), %((t + 4) MOD 5), %((t + 3) MOD 5), %((t + 2) MOD 5), %((t + 1) MOD 5), x1, x2
|
||||
ENDM
|
||||
|
||||
;
|
||||
; For the rest of the computation we need the extra register, so we update the data pointer and store it.
|
||||
;
|
||||
add ebp,64
|
||||
mov [esp+SymCryptSha1AppendBlocksFrame.pbData], ebp
|
||||
|
||||
FOR t, <16, 17, 18, 19>
|
||||
ROUND_CH t, %(t MOD 5), %((t + 4) MOD 5), %((t + 3) MOD 5), %((t + 2) MOD 5), %((t + 1) MOD 5), x1, x2
|
||||
ENDM
|
||||
|
||||
FOR t, <20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39>
|
||||
ROUND_PARITY t, %(t MOD 5), %((t + 4) MOD 5), %((t + 3) MOD 5), %((t + 2) MOD 5), %((t + 1) MOD 5), x1, x2, K20_39, 1
|
||||
ENDM
|
||||
|
||||
FOR t, <40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59>
|
||||
ROUND_MAJ t, %(t MOD 5), %((t + 4) MOD 5), %((t + 3) MOD 5), %((t + 2) MOD 5), %((t + 1) MOD 5), x1, x2
|
||||
ENDM
|
||||
|
||||
FOR t, <60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76>
|
||||
ROUND_PARITY t, %(t MOD 5), %((t + 4) MOD 5), %((t + 3) MOD 5), %((t + 2) MOD 5), %((t + 1) MOD 5), x1, x2, K60_79, 1
|
||||
ENDM
|
||||
|
||||
;
|
||||
; The last three rounds do not need to store their Wt in the W buffer as that value will never get used.
|
||||
;
|
||||
FOR t, <77, 78, 79>
|
||||
ROUND_PARITY t, %(t MOD 5), %((t + 4) MOD 5), %((t + 3) MOD 5), %((t + 2) MOD 5), %((t + 1) MOD 5), x1, x2, K60_79, 0
|
||||
ENDM
|
||||
|
||||
;
|
||||
; Now we update the state
|
||||
;
|
||||
mov x2,[esp+SymCryptSha1AppendBlocksFrame.Hptr]
|
||||
add r0,[x2 ]
|
||||
add r4,[x2+ 4]
|
||||
add r3,[x2+ 8]
|
||||
add r2,[x2+12]
|
||||
add r1,[x2+16]
|
||||
|
||||
mov [x2 ], r0
|
||||
mov [x2+ 4], r4
|
||||
mov [x2+ 8], r3
|
||||
mov [x2+12], r2
|
||||
mov [x2+16], r1
|
||||
|
||||
;
|
||||
; See if we have more data to process, and load the data pointer register again
|
||||
;
|
||||
dec [esp+SymCryptSha1AppendBlocksFrame.BlockCount]
|
||||
mov ebp, [esp+SymCryptSha1AppendBlocksFrame.pbData]
|
||||
jnz SymCryptSha1AppendBlocksLoop
|
||||
|
||||
;
|
||||
; We're done processing the blocks. The result is already in the state, so all we have to do
|
||||
; is clean up.
|
||||
;
|
||||
; Wipe the W buffer
|
||||
; The @@: label is an anonymous label. You can refer to the previous one using @B, which is easy to read.
|
||||
;
|
||||
mov ecx,8
|
||||
xor eax,eax
|
||||
@@: dec ecx
|
||||
mov [esp+8*ecx],eax
|
||||
mov [esp+8*ecx+4],eax
|
||||
jnz @B
|
||||
|
||||
SymCryptSha1AppendBlocksDone:
|
||||
;
|
||||
; Restore non-volatile regisers & stackpointer
|
||||
;
|
||||
mov ebp,[esp+SymCryptSha1AppendBlocksFrame.SaveEbp]
|
||||
mov edi,[esp+SymCryptSha1AppendBlocksFrame.SaveEdi]
|
||||
mov esi,[esp+SymCryptSha1AppendBlocksFrame.SaveEsi]
|
||||
mov ebx,[esp+SymCryptSha1AppendBlocksFrame.SaveEbx]
|
||||
add esp,SymCryptSha1AppendBlocksFrame.ReturnAddress
|
||||
|
||||
ret 4
|
||||
|
||||
@SymCryptSha1AppendBlocksAsm@12 ENDP
|
||||
_TEXT ENDS
|
||||
|
||||
END
|
||||
|
|
@ -7,9 +7,7 @@
|
|||
|
||||
#include "precomp.h"
|
||||
|
||||
#define EQU =
|
||||
#include "C_asm_shared.inc"
|
||||
#undef EQU
|
||||
|
||||
#include "buildInfo.h"
|
||||
|
||||
|
@ -34,16 +32,16 @@ SymCryptLibraryWasNotInitialized()
|
|||
|
||||
#endif
|
||||
|
||||
const CHAR * SymCryptBuildString =
|
||||
"v" SYMCRYPT_BUILD_INFO_VERSION
|
||||
"_" SYMCRYPT_BUILD_INFO_BRANCH
|
||||
const CHAR * SymCryptBuildString =
|
||||
"v" SYMCRYPT_BUILD_INFO_VERSION
|
||||
"_" SYMCRYPT_BUILD_INFO_BRANCH
|
||||
"_" SYMCRYPT_BUILD_INFO_COMMIT
|
||||
"_" SYMCRYPT_BUILD_INFO_TIMESTAMP;
|
||||
|
||||
VOID
|
||||
SYMCRYPT_CALL
|
||||
SymCryptInitEnvCommon( UINT32 version )
|
||||
// Returns TRUE if the initializatoin steps have to be performed.
|
||||
// Returns TRUE if the initialization steps have to be performed.
|
||||
{
|
||||
UINT32 tmp;
|
||||
|
||||
|
|
|
@ -1,223 +0,0 @@
|
|||
//
|
||||
// asmstubs.c
|
||||
// Temporary forwarders for ASM implementations which we don't yet support with GCC/LLVM
|
||||
//
|
||||
// Copyright (c) Microsoft Corporation. Licensed under the MIT license.
|
||||
//
|
||||
|
||||
#include "../precomp.h"
|
||||
|
||||
extern const SYMCRYPT_BLOCKCIPHER SymCryptAesBlockCipherNoOpt;
|
||||
|
||||
VOID
|
||||
SYMCRYPT_CALL
|
||||
SymCryptAesEncryptAsm(
|
||||
_In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
|
||||
_In_reads_(SYMCRYPT_AES_BLOCK_SIZE) PCBYTE pbSrc,
|
||||
_Out_writes_(SYMCRYPT_AES_BLOCK_SIZE) PBYTE pbDst )
|
||||
{
|
||||
SymCryptAesEncryptC( pExpandedKey, pbSrc, pbDst );
|
||||
}
|
||||
|
||||
VOID
|
||||
SYMCRYPT_CALL
|
||||
SymCryptAesDecryptAsm(
|
||||
_In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
|
||||
_In_reads_(SYMCRYPT_AES_BLOCK_SIZE) PCBYTE pbSrc,
|
||||
_Out_writes_(SYMCRYPT_AES_BLOCK_SIZE) PBYTE pbDst )
|
||||
{
|
||||
SymCryptAesDecryptC( pExpandedKey, pbSrc, pbDst );
|
||||
}
|
||||
|
||||
VOID
|
||||
SYMCRYPT_CALL
|
||||
SymCryptAesCbcEncryptAsm(
|
||||
_In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
|
||||
_Inout_updates_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbChainingValue,
|
||||
_In_reads_( cbData ) PCBYTE pbSrc,
|
||||
_Out_writes_( cbData ) PBYTE pbDst,
|
||||
SIZE_T cbData )
|
||||
{
|
||||
SymCryptCbcEncrypt( &SymCryptAesBlockCipherNoOpt, pExpandedKey, pbChainingValue, pbSrc, pbDst, cbData );
|
||||
}
|
||||
|
||||
VOID
|
||||
SYMCRYPT_CALL
|
||||
SymCryptAesCbcDecryptAsm(
|
||||
_In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
|
||||
_Inout_updates_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbChainingValue,
|
||||
_In_reads_( cbData ) PCBYTE pbSrc,
|
||||
_Out_writes_( cbData ) PBYTE pbDst,
|
||||
SIZE_T cbData )
|
||||
{
|
||||
SymCryptCbcDecrypt( &SymCryptAesBlockCipherNoOpt, pExpandedKey, pbChainingValue, pbSrc, pbDst, cbData );
|
||||
}
|
||||
|
||||
VOID
|
||||
SYMCRYPT_CALL
|
||||
SymCryptAesCtrMsb64Asm(
|
||||
_In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
|
||||
_Inout_updates_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbChainingValue,
|
||||
_In_reads_( cbData ) PCBYTE pbSrc,
|
||||
_Out_writes_( cbData ) PBYTE pbDst,
|
||||
SIZE_T cbData )
|
||||
{
|
||||
SYMCRYPT_ASSERT( SymCryptAesBlockCipherNoOpt.blockSize == SYMCRYPT_AES_BLOCK_SIZE ); // keep Prefast happy
|
||||
SymCryptCtrMsb64( &SymCryptAesBlockCipherNoOpt, pExpandedKey, pbChainingValue, pbSrc, pbDst, cbData );
|
||||
}
|
||||
|
||||
VOID
|
||||
SYMCRYPT_CALL
|
||||
SymCryptWipeAsm( _Out_writes_bytes_( cbData ) PVOID pbData, SIZE_T cbData )
|
||||
{
|
||||
volatile BYTE * p = (volatile BYTE *) pbData;
|
||||
SIZE_T i;
|
||||
|
||||
for( i=0; i<cbData; i++ ){
|
||||
p[i] = 0;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
VOID
|
||||
SYMCRYPT_CALL
|
||||
SymCryptFdefMaskedCopyC(
|
||||
_In_reads_bytes_( nDigits*SYMCRYPT_FDEF_DIGIT_SIZE ) PCBYTE pbSrc,
|
||||
_Inout_updates_bytes_( nDigits*SYMCRYPT_FDEF_DIGIT_SIZE ) PBYTE pbDst,
|
||||
UINT32 nDigits,
|
||||
UINT32 mask );
|
||||
|
||||
VOID
|
||||
SYMCRYPT_CALL
|
||||
SymCryptFdefMaskedCopyAsm(
|
||||
_In_reads_bytes_( nDigits*SYMCRYPT_FDEF_DIGIT_SIZE ) PCBYTE pbSrc,
|
||||
_Inout_updates_bytes_( nDigits*SYMCRYPT_FDEF_DIGIT_SIZE ) PBYTE pbDst,
|
||||
UINT32 nDigits,
|
||||
UINT32 mask )
|
||||
{
|
||||
SymCryptFdefMaskedCopyC( pbSrc, pbDst, nDigits, mask );
|
||||
}
|
||||
|
||||
UINT32
|
||||
SYMCRYPT_CALL
|
||||
SymCryptFdefRawAddC(
|
||||
_In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc1,
|
||||
_In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc2,
|
||||
_Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 pDst,
|
||||
UINT32 nDigits );
|
||||
|
||||
UINT32
|
||||
SYMCRYPT_CALL
|
||||
SymCryptFdefRawAddAsm(
|
||||
_In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc1,
|
||||
_In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc2,
|
||||
_Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 pDst,
|
||||
UINT32 nDigits )
|
||||
{
|
||||
return SymCryptFdefRawAddC( pSrc1, pSrc2, pDst, nDigits );
|
||||
}
|
||||
|
||||
UINT32
|
||||
SYMCRYPT_CALL
|
||||
SymCryptFdefRawSubC(
|
||||
_In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc1,
|
||||
_In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc2,
|
||||
_Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 pDst,
|
||||
UINT32 nDigits );
|
||||
|
||||
UINT32
|
||||
SYMCRYPT_CALL
|
||||
SymCryptFdefRawSubAsm(
|
||||
_In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc1,
|
||||
_In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc2,
|
||||
_Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 pDst,
|
||||
UINT32 nDigits )
|
||||
{
|
||||
return SymCryptFdefRawSubC( pSrc1, pSrc2, pDst, nDigits );
|
||||
}
|
||||
|
||||
VOID
|
||||
SYMCRYPT_CALL
|
||||
SymCryptFdefRawMulC(
|
||||
_In_reads_(nWords1) PCUINT32 pSrc1,
|
||||
UINT32 nDigits1,
|
||||
_In_reads_(nWords2) PCUINT32 pSrc2,
|
||||
UINT32 nDigits2,
|
||||
_Out_writes_(nWords1 + nWords2) PUINT32 pDst );
|
||||
|
||||
VOID
|
||||
SYMCRYPT_CALL
|
||||
SymCryptFdefRawMulMulx(
|
||||
_In_reads_(nDgigits1*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc1,
|
||||
UINT32 nDigits1,
|
||||
_In_reads_(nDigits2*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc2,
|
||||
UINT32 nDigits2,
|
||||
_Out_writes_(nWords1 + nWords2) PUINT32 pDst )
|
||||
{
|
||||
SymCryptFdefRawMulC( pSrc1, nDigits1, pSrc2, nDigits2, pDst );
|
||||
}
|
||||
|
||||
VOID
|
||||
SYMCRYPT_CALL
|
||||
SymCryptFdefRawMulAsm(
|
||||
_In_reads_(nDigits1*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc1,
|
||||
UINT32 nDigits1,
|
||||
_In_reads_(nDigits2*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc2,
|
||||
UINT32 nDigits2,
|
||||
_Out_writes_(nWords1 + nWords2) PUINT32 pDst )
|
||||
{
|
||||
SymCryptFdefRawMulC( pSrc1, nDigits1, pSrc2, nDigits2, pDst );
|
||||
}
|
||||
|
||||
VOID
|
||||
SYMCRYPT_CALL
|
||||
SymCryptFdefRawSquareC(
|
||||
_In_reads_(nWords) PCUINT32 pSrc,
|
||||
UINT32 nDigits,
|
||||
_Out_writes_(2*nWords) PUINT32 pDst );
|
||||
|
||||
VOID
|
||||
SYMCRYPT_CALL
|
||||
SymCryptFdefRawSquareMulx(
|
||||
_In_reads_(nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc,
|
||||
UINT32 nDigits,
|
||||
_Out_writes_(2*nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PUINT32 pDst )
|
||||
{
|
||||
SymCryptFdefRawSquareC( pSrc, nDigits, pDst );
|
||||
}
|
||||
|
||||
VOID
|
||||
SYMCRYPT_CALL
|
||||
SymCryptFdefRawSquareAsm(
|
||||
_In_reads_(nDgigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc,
|
||||
UINT32 nDigits,
|
||||
_Out_writes_(2*nWords) PUINT32 pDst )
|
||||
{
|
||||
SymCryptFdefRawSquareC( pSrc, nDigits, pDst );
|
||||
}
|
||||
|
||||
VOID
|
||||
SymCryptFdefMontgomeryReduceC(
|
||||
_In_ PCSYMCRYPT_MODULUS pmMod,
|
||||
_In_ PUINT32 pSrc,
|
||||
_Out_ PUINT32 pDst );
|
||||
|
||||
VOID
|
||||
SYMCRYPT_CALL
|
||||
SymCryptFdefMontgomeryReduceMulx(
|
||||
_In_ PCSYMCRYPT_MODULUS pmMod,
|
||||
_In_ PUINT32 pSrc,
|
||||
_Out_ PUINT32 pDst )
|
||||
{
|
||||
SymCryptFdefMontgomeryReduceC( pmMod, pSrc, pDst );
|
||||
}
|
||||
|
||||
VOID
|
||||
SYMCRYPT_CALL
|
||||
SymCryptFdefMontgomeryReduceAsm(
|
||||
_In_ PCSYMCRYPT_MODULUS pmMod,
|
||||
_In_ PUINT32 pSrc,
|
||||
_Out_ PUINT32 pDst )
|
||||
{
|
||||
SymCryptFdefMontgomeryReduceC( pmMod, pSrc, pDst );
|
||||
}
|
|
@ -0,0 +1,12 @@
|
|||
.SUFFIXES: .symcryptasm .cppasm
|
||||
|
||||
# We still have architecture-specific inference rules because otherwise we cannot do any architecture-specific preprocessing
|
||||
|
||||
# Preprocess amd64 .symcryptasm into masm
|
||||
{amd64\}.symcryptasm{$(OBJ_PATH)\$(O)\..\amd64\}.asm:
|
||||
..\scripts\symcryptasm_processor.py masm $< $(OBJ_PATH)\$(O)\$(<B).cppasm
|
||||
$(CC) $(CFLAGS) /EP /P /I..\inc\ /I.\ /DSYMCRYPT_CPU_AMD64 /DSYMCRYPT_MASM /Fi$@ $(OBJ_PATH)\$(O)\$(<B).cppasm
|
||||
|
||||
# Preprocess x86 .cppasm into masm
|
||||
{i386\}.cppasm{$(OBJ_PATH)\$(O)\..\i386\}.asm:
|
||||
$(CC) $(CFLAGS) /EP /P /I..\inc\ /I.\ /DSYMCRYPT_CPU_X86 /DSYMCRYPT_MASM /Fi$@ $<
|
100
lib/sc_lib.h
100
lib/sc_lib.h
|
@ -2212,11 +2212,11 @@ SymCryptFdefModElementToIntGeneric(
|
|||
SYMCRYPT_ERROR
|
||||
SYMCRYPT_CALL
|
||||
SymCryptFdefRawSetValue(
|
||||
_In_reads_bytes_(cbSrc) PCBYTE pbSrc,
|
||||
SIZE_T cbSrc,
|
||||
SYMCRYPT_NUMBER_FORMAT format,
|
||||
_Out_writes_(nWords) PUINT32 pDst,
|
||||
UINT32 nWords );
|
||||
_In_reads_bytes_(cbSrc) PCBYTE pbSrc,
|
||||
SIZE_T cbSrc,
|
||||
SYMCRYPT_NUMBER_FORMAT format,
|
||||
_Out_writes_(nDigits * SYMCRYPT_FDEF_DIGIT_NUINT32) PUINT32 pDst,
|
||||
UINT32 nDigits );
|
||||
|
||||
SYMCRYPT_ERROR
|
||||
SYMCRYPT_CALL
|
||||
|
@ -2250,11 +2250,11 @@ SymCryptFdefModElementSetValueNegUint32(
|
|||
SYMCRYPT_ERROR
|
||||
SYMCRYPT_CALL
|
||||
SymCryptFdefRawGetValue(
|
||||
_In_reads_(nWords) PCUINT32 pSrc,
|
||||
UINT32 nWords,
|
||||
_Out_writes_bytes_(cbBytes) PBYTE pbDst,
|
||||
SIZE_T cbDst,
|
||||
SYMCRYPT_NUMBER_FORMAT format );
|
||||
_In_reads_(nDigits * SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc,
|
||||
UINT32 nDigits,
|
||||
_Out_writes_bytes_(cbBytes) PBYTE pbDst,
|
||||
SIZE_T cbDst,
|
||||
SYMCRYPT_NUMBER_FORMAT format );
|
||||
|
||||
SYMCRYPT_ERROR
|
||||
SYMCRYPT_CALL
|
||||
|
@ -2492,14 +2492,6 @@ SymCryptFdefRawSubUint32(
|
|||
_Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 pDst,
|
||||
UINT32 nDigits );
|
||||
|
||||
UINT32
|
||||
SYMCRYPT_CALL
|
||||
SymCryptFdefRawMaskedAddUint32(
|
||||
_Inout_updates_( nWords ) PUINT32 pAcc,
|
||||
_In_reads_( nWords ) PCUINT32 pSrc,
|
||||
UINT32 mask,
|
||||
UINT32 nWords );
|
||||
|
||||
VOID
|
||||
SYMCRYPT_CALL
|
||||
SymCryptFdefModMulGeneric(
|
||||
|
@ -2530,16 +2522,6 @@ SymCryptFdefModMulMontgomery256Asm(
|
|||
_Out_writes_bytes_( cbScratch ) PBYTE pbScratch,
|
||||
SIZE_T cbScratch );
|
||||
|
||||
VOID
|
||||
SYMCRYPT_CALL
|
||||
SymCryptFdefModMulMontgomery256Test(
|
||||
_In_ PCSYMCRYPT_MODULUS pMod,
|
||||
_In_ PCSYMCRYPT_MODELEMENT pSrc1,
|
||||
_In_ PCSYMCRYPT_MODELEMENT pSrc2,
|
||||
_Out_ PSYMCRYPT_MODELEMENT pDst,
|
||||
_Out_writes_bytes_( cbScratch ) PBYTE pbScratch,
|
||||
SIZE_T cbScratch );
|
||||
|
||||
VOID
|
||||
SYMCRYPT_CALL
|
||||
SymCryptFdef369ModMulMontgomery(
|
||||
|
@ -2684,11 +2666,11 @@ SymCryptFdefRawMul(
|
|||
VOID
|
||||
SYMCRYPT_CALL
|
||||
SymCryptFdefRawMulMulx(
|
||||
_In_reads_(nDigits1*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc1,
|
||||
UINT32 nDigits1,
|
||||
_In_reads_(nDigits2*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc2,
|
||||
UINT32 nDigits2,
|
||||
_Out_writes_(nWords1 + nWords2) PUINT32 pDst );
|
||||
_In_reads_(nDigits1*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc1,
|
||||
UINT32 nDigits1,
|
||||
_In_reads_(nDigits2*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc2,
|
||||
UINT32 nDigits2,
|
||||
_Out_writes_((nDigits1+nDigits2)*SYMCRYPT_FDEF_DIGIT_NUINT32) PUINT32 pDst );
|
||||
|
||||
VOID
|
||||
SYMCRYPT_CALL
|
||||
|
@ -2696,7 +2678,7 @@ SymCryptFdefRawMulMulx1024(
|
|||
_In_reads_(nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc1,
|
||||
_In_reads_(nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc2,
|
||||
UINT32 nDigits,
|
||||
_Out_writes_(nWords1 + nWords2) PUINT32 pDst );
|
||||
_Out_writes_(2*nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PUINT32 pDst );
|
||||
|
||||
VOID
|
||||
SYMCRYPT_CALL
|
||||
|
@ -2732,7 +2714,7 @@ UINT32
|
|||
SYMCRYPT_CALL
|
||||
SymCryptFdefRawIsEqualUint32(
|
||||
_In_ PCUINT32 pSrc1,
|
||||
UINT32 nWords,
|
||||
UINT32 nDigits,
|
||||
_In_ UINT32 u32Src2 );
|
||||
|
||||
UINT32
|
||||
|
@ -2909,27 +2891,27 @@ SymCryptFdef369MaskedCopyAsm(
|
|||
VOID
|
||||
SYMCRYPT_CALL
|
||||
SymCryptFdefRawMulAsm(
|
||||
_In_reads_(nDgigits1*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc1,
|
||||
UINT32 nDigits1,
|
||||
_In_reads_(nDigits2*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc2,
|
||||
UINT32 nDigits2,
|
||||
_Out_writes_(nWords1 + nWords2) PUINT32 pDst );
|
||||
_In_reads_(nDigits1*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc1,
|
||||
UINT32 nDigits1,
|
||||
_In_reads_(nDigits2*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc2,
|
||||
UINT32 nDigits2,
|
||||
_Out_writes_((nDigits1+nDigits2)*SYMCRYPT_FDEF_DIGIT_NUINT32) PUINT32 pDst );
|
||||
|
||||
VOID
|
||||
SYMCRYPT_CALL
|
||||
SymCryptFdefRawSquareAsm(
|
||||
_In_reads_(nDgigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc,
|
||||
_In_reads_(nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc,
|
||||
UINT32 nDigits,
|
||||
_Out_writes_(2*nWords) PUINT32 pDst );
|
||||
_Out_writes_(2*nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PUINT32 pDst );
|
||||
|
||||
VOID
|
||||
SYMCRYPT_CALL
|
||||
SymCryptFdef369RawMulAsm(
|
||||
_In_reads_(nDgigits1*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc1,
|
||||
UINT32 nDigits1,
|
||||
_In_reads_(nDigits2*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc2,
|
||||
UINT32 nDigits2,
|
||||
_Out_writes_(nWords1 + nWords2) PUINT32 pDst );
|
||||
_In_reads_(nDigits1*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc1,
|
||||
UINT32 nDigits1,
|
||||
_In_reads_(nDigits2*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc2,
|
||||
UINT32 nDigits2,
|
||||
_Out_writes_((nDigits1+nDigits2)*SYMCRYPT_FDEF_DIGIT_NUINT32) PUINT32 pDst );
|
||||
|
||||
VOID
|
||||
SYMCRYPT_CALL
|
||||
|
@ -2937,14 +2919,14 @@ SymCryptFdefRawMul512Asm(
|
|||
_In_reads_(nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc1,
|
||||
_In_reads_(nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc2,
|
||||
UINT32 nDigits,
|
||||
_Out_writes_(2*nWords) PUINT32 pDst );
|
||||
_Out_writes_(2*nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PUINT32 pDst );
|
||||
|
||||
VOID
|
||||
SYMCRYPT_CALL
|
||||
SymCryptFdefRawSquare512Asm(
|
||||
_In_reads_(nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc,
|
||||
UINT32 nDigits,
|
||||
_Out_writes_(2*nWords) PUINT32 pDst );
|
||||
_Out_writes_(2*nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PUINT32 pDst );
|
||||
|
||||
VOID
|
||||
SYMCRYPT_CALL
|
||||
|
@ -2952,69 +2934,69 @@ SymCryptFdefRawMul1024Asm(
|
|||
_In_reads_(nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc1,
|
||||
_In_reads_(nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc2,
|
||||
UINT32 nDigits,
|
||||
_Out_writes_(2*nWords) PUINT32 pDst );
|
||||
_Out_writes_(2*nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PUINT32 pDst );
|
||||
|
||||
VOID
|
||||
SYMCRYPT_CALL
|
||||
SymCryptFdefRawSquare1024Asm(
|
||||
_In_reads_(nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc,
|
||||
UINT32 nDigits,
|
||||
_Out_writes_(2*nWords) PUINT32 pDst );
|
||||
_Out_writes_(2*nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PUINT32 pDst );
|
||||
|
||||
VOID
|
||||
SYMCRYPT_CALL
|
||||
SymCryptFdefMontgomeryReduceAsm(
|
||||
_In_ PCSYMCRYPT_MODULUS pmMod,
|
||||
_In_ PUINT32 pSrc,
|
||||
_Inout_ PUINT32 pSrc,
|
||||
_Out_ PUINT32 pDst );
|
||||
|
||||
VOID
|
||||
SYMCRYPT_CALL
|
||||
SymCryptFdefMontgomeryReduce256Asm(
|
||||
_In_ PCSYMCRYPT_MODULUS pmMod,
|
||||
_In_ PUINT32 pSrc,
|
||||
_Inout_ PUINT32 pSrc,
|
||||
_Out_ PUINT32 pDst );
|
||||
|
||||
VOID
|
||||
SYMCRYPT_CALL
|
||||
SymCryptFdefMontgomeryReduce512Asm(
|
||||
_In_ PCSYMCRYPT_MODULUS pmMod,
|
||||
_In_ PUINT32 pSrc,
|
||||
_Inout_ PUINT32 pSrc,
|
||||
_Out_ PUINT32 pDst );
|
||||
|
||||
VOID
|
||||
SYMCRYPT_CALL
|
||||
SymCryptFdefMontgomeryReduce1024Asm(
|
||||
_In_ PCSYMCRYPT_MODULUS pmMod,
|
||||
_In_ PUINT32 pSrc,
|
||||
_Inout_ PUINT32 pSrc,
|
||||
_Out_ PUINT32 pDst );
|
||||
|
||||
VOID
|
||||
SYMCRYPT_CALL
|
||||
SymCryptFdef369MontgomeryReduce(
|
||||
_In_ PCSYMCRYPT_MODULUS pmMod,
|
||||
_In_ PUINT32 pSrc,
|
||||
_Inout_ PUINT32 pSrc,
|
||||
_Out_ PUINT32 pDst );
|
||||
|
||||
VOID
|
||||
SYMCRYPT_CALL
|
||||
SymCryptFdef369MontgomeryReduceAsm(
|
||||
_In_ PCSYMCRYPT_MODULUS pmMod,
|
||||
_In_ PUINT32 pSrc,
|
||||
_Inout_ PUINT32 pSrc,
|
||||
_Out_ PUINT32 pDst );
|
||||
|
||||
VOID
|
||||
SYMCRYPT_CALL
|
||||
SymCryptFdefMontgomeryReduceMulx(
|
||||
_In_ PCSYMCRYPT_MODULUS pmMod,
|
||||
_In_ PUINT32 pSrc,
|
||||
_Inout_ PUINT32 pSrc,
|
||||
_Out_ PUINT32 pDst );
|
||||
|
||||
VOID
|
||||
SYMCRYPT_CALL
|
||||
SymCryptFdefMontgomeryReduceMulx1024(
|
||||
_In_ PCSYMCRYPT_MODULUS pmMod,
|
||||
_In_ PUINT32 pSrc,
|
||||
_Inout_ PUINT32 pSrc,
|
||||
_Out_ PUINT32 pDst );
|
||||
|
||||
// Helper macro for checking for specific key validation flag using bits 4 and 5 in a flags variable
|
||||
|
|
22
lib/sources
22
lib/sources
|
@ -13,14 +13,29 @@ ARM64X_EC_ENABLED=1
|
|||
TARGETNAME = symcrypt
|
||||
TARGETTYPE=LIBRARY
|
||||
|
||||
KM_LIBRARY = 1 # enable /kernel flag & epilogue metadata
|
||||
GUARD = 1 # enable CFG
|
||||
KM_LIBRARY = 1 # enable /kernel flag & epilogue metadata
|
||||
GUARD = 1 # enable CFG
|
||||
ENABLE_ASM_RETPOLINE = 1
|
||||
ENABLE_RETPOLINE_LINKER_WARNING = 1
|
||||
|
||||
# Enable /Gy for all assembler code
|
||||
ASM_DEFINES=$(ASM_DEFINES) /Gy
|
||||
|
||||
USE_MAKEFILE_INC = 1
|
||||
|
||||
# Explicitly call out that we must preprocess symcryptasm files
|
||||
# Make the target paths be architecture specific to get nmake to pick the right inference rule
|
||||
NTTARGETFILE0=\
|
||||
!IF "$(_BUILDARCH)" == "amd64"
|
||||
$(OBJ_PATH)\$(O)\..\amd64\fdef_asm.asm \
|
||||
$(OBJ_PATH)\$(O)\..\amd64\wipe.asm \
|
||||
$(OBJ_PATH)\$(O)\..\amd64\aesasm.asm \
|
||||
$(OBJ_PATH)\$(O)\..\amd64\fdef369_asm.asm \
|
||||
$(OBJ_PATH)\$(O)\..\amd64\fdef_mulx.asm \
|
||||
!ELSEIF "$(_BUILDARCH)" == "x86"
|
||||
$(OBJ_PATH)\$(O)\..\i386\fdef_asm.asm \
|
||||
!ENDIF
|
||||
|
||||
INCLUDES= \
|
||||
..\inc; \
|
||||
$(DS_INC_PATH)\crypto; \
|
||||
|
@ -137,7 +152,6 @@ SOURCES= \
|
|||
scsTools.c \
|
||||
|
||||
AMD64_SOURCES = \
|
||||
# sha1asm.asm \
|
||||
wipe.asm \
|
||||
aesasm.asm \
|
||||
fdef_asm.asm \
|
||||
|
@ -145,10 +159,8 @@ AMD64_SOURCES = \
|
|||
fdef_mulx.asm \
|
||||
|
||||
I386_SOURCES = \
|
||||
# sha1asm.asm \
|
||||
aesasm.asm \
|
||||
wipe.asm \
|
||||
# rc4asm.asm \
|
||||
fdef_asm.asm \
|
||||
|
||||
ARM_SOURCES = \
|
||||
|
|
|
@ -0,0 +1,36 @@
|
|||
//
|
||||
// symcryptasm_shared.cppasm Shared definitions used by the C preprocessor step in symcryptasm
|
||||
// processing. See scripts/symcryptasm_processor.py for more details.
|
||||
//
|
||||
// Copyright (c) Microsoft Corporation. Licensed under the MIT license.
|
||||
//
|
||||
|
||||
#if defined(SYMCRYPT_MASM)
|
||||
|
||||
#if defined(SYMCRYPT_CPU_AMD64)
|
||||
include ksamd64.inc
|
||||
#endif
|
||||
|
||||
#include "C_asm_shared.inc"
|
||||
|
||||
#define FILE_END() END
|
||||
#define ALIGN(__alignment) align __alignment
|
||||
#define GET_SYMBOL_ADDRESS(__symbol) __symbol
|
||||
#define HEX(__constant) __constant##h
|
||||
|
||||
#elif defined(SYMCRYPT_GAS)
|
||||
|
||||
.intel_syntax noprefix
|
||||
|
||||
#include "C_asm_shared.inc"
|
||||
|
||||
#define FILE_END()
|
||||
#define ALIGN(__alignment) .align __alignment
|
||||
#define GET_SYMBOL_ADDRESS(__symbol) __symbol+rip
|
||||
#define HEX(__constant) 0x##__constant
|
||||
|
||||
#else
|
||||
|
||||
#error Unknown target assembly
|
||||
|
||||
#endif
|
|
@ -0,0 +1,657 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
This script enables processing of symcryptasm files so that they can be assembled in a variety of
|
||||
environments without requiring forking or duplication of source files - symcryptasm files phrase
|
||||
assembly in an assembler and environment agnostic way.
|
||||
|
||||
The current target assemblers are:
|
||||
MASM and GAS
|
||||
The current target environments are:
|
||||
amd64 Windows (using the Microsoft x64 calling convention), and
|
||||
amd64 Linux (using the SystemV amd64 calling convention)
|
||||
|
||||
Currently we only support functions with up to 6 arguments, and only amd64, but the plan is to
|
||||
rephrase all remaining .asm in SymCrypt as symcryptasm, extending support as appropriate to enable
|
||||
this effort.
|
||||
|
||||
The processing of symcryptasm files takes place in 2 passes. The first pass is performed by this
|
||||
symcryptasm_processor.py script, which does the more stateful processing, outputting a .cppasm file.
|
||||
The .cppasm files are further processed by the C preprocessor to do more simple stateless text
|
||||
substitutions, outputting a .asm file which can be assembled by thetarget assembler for the target
|
||||
environment.
|
||||
|
||||
We have set up the intermediate generated files to be created in the output directories in both
|
||||
razzle and CMake builds.
|
||||
|
||||
### symcryptasm syntax ###
|
||||
|
||||
Different calling conventions pass arguments to functions in different registers, have differing
|
||||
numbers of volatile and non-volatile registers, and use the stack in different ways.
|
||||
|
||||
We define our own register naming scheme which abstracts away the differences between calling
|
||||
conventions. The generalities of the naming scheme will be similar across target architectures, but
|
||||
refer to the Architecture specifics below for details. For the following general information we use
|
||||
the notation R<n> to denote registers in the symcryptasm register naming scheme.
|
||||
|
||||
|
||||
A leaf function (a function which does not call another function) begins with an invocation of the
|
||||
FUNCTION_START macro which currently takes 3 arguments:
|
||||
1) The function name
|
||||
This must be the name that matches the corresponding declaration of the function
|
||||
2) The number of arguments (arg_count) that the function takes
|
||||
These arguments will be accessible in some contiguous region of the symcrypt registers at the
|
||||
start of the function
|
||||
On amd64 this contiguous region is R1..R<arg_count>
|
||||
Note: arg_count need not correspond to the exact number of argument in the function declaration
|
||||
if the assembly does not use some tail of the arguments
|
||||
3) The number of registers (reg_count) that the function uses
|
||||
These registers will be accessible as R0..R<reg_count-1>
|
||||
|
||||
A leaf function ends with the FUNCTION_END macro, which also takes the function name
|
||||
(a FUNCTION_END macro's function name must match the preceding FUNCTION_START's name)
|
||||
|
||||
At the function start a prologue is generated which arranges the arguments appropriately in
|
||||
registers, and saves non-volatile registers that have been requested to be used.
|
||||
At the function end an epilogue is generated with restores the non-volatile registers and returns.
|
||||
|
||||
|
||||
A nested function (a function which does call another function) is specified similarly, only using
|
||||
NESTED_FUNCTION_START and NESTED_FUNCTION_END macros. A nested function currently updates and align
|
||||
the stack pointer in the function prologue, and avoids use of the redzone in the SystemV ABI.
|
||||
|
||||
|
||||
A macro begins with an invocation of the MACRO_START macro which takes the Macro name, and variable
|
||||
number of macros argument names. It ends with MACRO_END.
|
||||
|
||||
### Architecture specifics ###
|
||||
|
||||
### amd64 ###
|
||||
We allow up to 15 registers to be addressed, with the names:
|
||||
Q0-Q15 (64-bit registers), W0-W15 (32-bit registers), H0-H15 (16-bit registers), and B0-B15 (8-bit
|
||||
registers)
|
||||
Xmm0-Xmm5 registers may be used directly in assembly too, as in both amd64 calling conventions we
|
||||
currently support, these registers are volatile so do not need any special handling
|
||||
|
||||
On function entry we insert a prologue which ensures:
|
||||
Q0 is the result register (the return value of the function, and the low half of a multiplication)
|
||||
Q1-Q6 are the first 6 arguments passed to the function
|
||||
|
||||
Additionally, there is a special case for functions using mul or mulx instructions, as these
|
||||
instructions make rdx a special register. Functions using these instructions may address Q0-Q14,
|
||||
and QH. As rdx is used to pass arguments, its value is moved to another register in the function
|
||||
prologue. The MUL_FUNCTION_START and MUL_FUNCTION_END macros are used in this case.
|
||||
We currently do not support nested mul functions, as we have none of them.
|
||||
|
||||
"""
|
||||
|
||||
import re
|
||||
import types
|
||||
import logging
|
||||
|
||||
class Register:
|
||||
"""A class to represent registers"""
|
||||
|
||||
def __init__(self, name64, name32, name16, name8):
|
||||
self.name64 = name64
|
||||
self.name32 = name32
|
||||
self.name16 = name16
|
||||
self.name8 = name8
|
||||
|
||||
# amd64 registers
|
||||
REG_RAX = Register("rax", "eax", "ax", "al")
|
||||
REG_RBX = Register("rbx", "ebx", "bx", "bl")
|
||||
REG_RCX = Register("rcx", "ecx", "cx", "cl")
|
||||
REG_RDX = Register("rdx", "edx", "dx", "dl")
|
||||
REG_RSI = Register("rsi", "esi", "si", "sil")
|
||||
REG_RDI = Register("rdi", "edi", "di", "dil")
|
||||
REG_RSP = Register("rsp", "esp", "sp", "spl")
|
||||
REG_RBP = Register("rbp", "ebp", "bp", "bpl")
|
||||
REG_R8 = Register( "r8", "r8d", "r8w", "r8b")
|
||||
REG_R9 = Register( "r9", "r9d", "r9w", "r9b")
|
||||
REG_R10 = Register("r10", "r10d", "r10w", "r10b")
|
||||
REG_R11 = Register("r11", "r11d", "r11w", "r11b")
|
||||
REG_R12 = Register("r12", "r12d", "r12w", "r12b")
|
||||
REG_R13 = Register("r13", "r13d", "r13w", "r13b")
|
||||
REG_R14 = Register("r14", "r14d", "r14w", "r14b")
|
||||
REG_R15 = Register("r15", "r15d", "r15w", "r15b")
|
||||
|
||||
class CallingConvention:
|
||||
"""A class to represent calling conventions"""
|
||||
|
||||
def __init__(self, name, architecture, mapping, argument_registers, volatile_registers, gen_prologue_fn, gen_epilogue_fn, gen_get_memslot_offset_fn):
|
||||
self.name = name
|
||||
self.architecture = architecture
|
||||
self.mapping = mapping
|
||||
self.argument_registers = argument_registers
|
||||
self.volatile_registers = volatile_registers
|
||||
self.gen_prologue_fn = types.MethodType(gen_prologue_fn, self)
|
||||
self.gen_epilogue_fn = types.MethodType(gen_epilogue_fn, self)
|
||||
self.gen_get_memslot_offset_fn = types.MethodType(gen_get_memslot_offset_fn, self)
|
||||
|
||||
|
||||
def get_mul_mapping_from_normal_mapping(mapping, argument_registers):
|
||||
"""Gets the register mapping used in functions requiring special rdx handling.
|
||||
|
||||
In amd64, when using mul and mulx, rdx is a special register.
|
||||
rdx is also used for passing arguments in both Msft and System V calling conventions.
|
||||
In asm functions that use mul or mulx, we will explicitly move the argument passed in
|
||||
rdx to a different volatile register in the function prologue, and in the function body
|
||||
we refer to rdx using (Q|D|W|B)H.
|
||||
"""
|
||||
rdx_index = None
|
||||
return_mapping = { 'H': REG_RDX }
|
||||
for (index, register) in mapping.items():
|
||||
if register == REG_RDX:
|
||||
rdx_index = index
|
||||
break
|
||||
for (index, register) in mapping.items():
|
||||
# preserve argument registers
|
||||
if (index <= argument_registers) and (index != rdx_index):
|
||||
return_mapping[index] = register
|
||||
# replace rdx with the first non-argument register
|
||||
if index == argument_registers+1:
|
||||
return_mapping[rdx_index] = register
|
||||
# shuffle all later registers down to fill the gap
|
||||
if index > argument_registers+1:
|
||||
return_mapping[index-1] = register
|
||||
return return_mapping
|
||||
|
||||
# Calling convention constants
|
||||
|
||||
MAX_FUNCTION_ARGUMENT_COUNT = 6 # restrict to 6 arguments for now
|
||||
MAX_FUNCTION_REGISTER_COUNT = 15
|
||||
|
||||
# Microsoft x64 calling convention
|
||||
MAPPING_AMD64_MSFT = {
|
||||
0: REG_RAX, # Result register
|
||||
1: REG_RCX, # Argument 1 / volatile
|
||||
2: REG_RDX, # Argument 2 / volatile
|
||||
3: REG_R8, # Argument 3 / volatile
|
||||
4: REG_R9, # Argument 4 / volatile
|
||||
5: REG_R10, # volatile
|
||||
6: REG_R11, # volatile
|
||||
7: REG_RSI, # All registers from rsi are non-volatile and need to be saved/restored in epi/prologue
|
||||
8: REG_RDI,
|
||||
9: REG_RBP,
|
||||
10:REG_RBX,
|
||||
11:REG_R12,
|
||||
12:REG_R13,
|
||||
13:REG_R14,
|
||||
14:REG_R15,
|
||||
# currently not mapping rsp
|
||||
}
|
||||
|
||||
def calc_amd64_shadow_space_allocation_size(self, reg_count):
|
||||
# If we are a nested function, we must allocate 32B of shadow space on the stack, and ensure the
|
||||
# stack pointer is aligned to 16B
|
||||
# Before the prologue we have rsp % 16 == 8 - as the call pushed an 8B return address on an
|
||||
# aligned stack
|
||||
alignment = 8
|
||||
# We then pushed some number of additional 8B registers onto the stack
|
||||
if reg_count > self.volatile_registers:
|
||||
alignment = (alignment + (8 * (self.volatile_registers - reg_count))) % 16
|
||||
shadow_space_allocation_size = 32
|
||||
if alignment == 8:
|
||||
# possibly allocate 8 more bytes to align the stack to 16B
|
||||
shadow_space_allocation_size += 8
|
||||
return shadow_space_allocation_size
|
||||
|
||||
def gen_prologue_amd64_msft(self, arg_count, reg_count, mul_fixup="", nested=False):
|
||||
prologue = "\n"
|
||||
if reg_count > self.volatile_registers:
|
||||
prologue += "rex_push_reg Q%s\n" % self.volatile_registers
|
||||
for i in range(self.volatile_registers+1, reg_count):
|
||||
prologue += "push_reg Q%s\n" % i
|
||||
prologue += "\nEND_PROLOGUE\n\n"
|
||||
|
||||
shadow_space_allocation_size = 0
|
||||
|
||||
if nested:
|
||||
shadow_space_allocation_size = calc_amd64_shadow_space_allocation_size(self, reg_count)
|
||||
prologue += "sub rsp, %d // allocate shadow space and align stack\n\n" % shadow_space_allocation_size
|
||||
|
||||
prologue += mul_fixup
|
||||
|
||||
# put additional arguments into Q5-Q6 (we do not support more than 6 (MAX_FUNCTION_ARGUMENT_COUNT) arguments for now)
|
||||
# stack_offset to get the 5th argument is:
|
||||
# 32B of shadow space + 8B for return address + (8*#pushed registers in prologue) + shadow_space_allocation_size
|
||||
stack_offset = 32 + 8 + (8*(reg_count-self.volatile_registers)) + shadow_space_allocation_size
|
||||
for i in range(self.argument_registers+1, min(arg_count+1, MAX_FUNCTION_ARGUMENT_COUNT+1)):
|
||||
prologue += "mov Q%s, [rsp + %d]\n" % (i, stack_offset)
|
||||
stack_offset += 8
|
||||
return prologue
|
||||
|
||||
def gen_prologue_amd64_msft_mul(self, arg_count, reg_count):
|
||||
return gen_prologue_amd64_msft(self, arg_count, reg_count, "mov Q2, QH\n")
|
||||
|
||||
def gen_prologue_amd64_msft_nested(self, arg_count, reg_count):
|
||||
return gen_prologue_amd64_msft(self, arg_count, reg_count, "", nested=True)
|
||||
|
||||
def gen_epilogue_amd64_msft(self, arg_count, reg_count, nested=False):
|
||||
epilogue = ""
|
||||
|
||||
if nested:
|
||||
shadow_space_allocation_size = calc_amd64_shadow_space_allocation_size(self, reg_count)
|
||||
epilogue += "add rsp, %d // deallocate shadow space and align stack\n\n" % shadow_space_allocation_size
|
||||
|
||||
if reg_count > self.volatile_registers:
|
||||
epilogue += "BEGIN_EPILOGUE\n"
|
||||
for i in reversed(range(self.volatile_registers, reg_count)):
|
||||
epilogue += "pop Q%s\n" % i
|
||||
epilogue += "ret\n"
|
||||
return epilogue
|
||||
|
||||
def gen_epilogue_amd64_msft_nested(self, arg_count, reg_count):
|
||||
return gen_epilogue_amd64_msft(self, arg_count, reg_count, nested=True)
|
||||
|
||||
def gen_get_memslot_offset_amd64_msft(self, slot, arg_count, reg_count, nested=False):
|
||||
# only support 4 memory slots for now (in shadow space)
|
||||
if(slot >= 4):
|
||||
logging.error("Symcryptasm currently only support 4 memory slots! (requested slot%d)" % slot)
|
||||
exit(1)
|
||||
# 8B for return address + (8*#pushed registers in prologue)
|
||||
stack_offset = 8 + (8*(reg_count-self.volatile_registers))
|
||||
if nested:
|
||||
stack_offset += calc_amd64_shadow_space_allocation_size(self, reg_count)
|
||||
return "%d /*MEMSLOT%d*/" % (stack_offset+(8*slot), slot)
|
||||
|
||||
def gen_get_memslot_offset_amd64_msft_nested(self, slot, arg_count, reg_count):
|
||||
return gen_get_memslot_offset_amd64_msft(self, slot, arg_count, reg_count, nested=True)
|
||||
|
||||
CALLING_CONVENTION_AMD64_MSFT = CallingConvention(
|
||||
"msft_x64", "amd64", MAPPING_AMD64_MSFT, 4, 7,
|
||||
gen_prologue_amd64_msft, gen_epilogue_amd64_msft, gen_get_memslot_offset_amd64_msft)
|
||||
CALLING_CONVENTION_AMD64_MSFT_MUL = CallingConvention(
|
||||
"msft_x64", "amd64", get_mul_mapping_from_normal_mapping(MAPPING_AMD64_MSFT, 4), 4, 6,
|
||||
gen_prologue_amd64_msft_mul, gen_epilogue_amd64_msft, gen_get_memslot_offset_amd64_msft)
|
||||
CALLING_CONVENTION_AMD64_MSFT_NESTED = CallingConvention(
|
||||
"msft_x64", "amd64", MAPPING_AMD64_MSFT, 4, 7,
|
||||
gen_prologue_amd64_msft_nested, gen_epilogue_amd64_msft_nested, gen_get_memslot_offset_amd64_msft_nested)
|
||||
|
||||
# AMD64 System V calling convention
|
||||
MAPPING_AMD64_SYSTEMV = {
|
||||
0: REG_RAX, # Result register
|
||||
1: REG_RDI, # Argument 1 / volatile
|
||||
2: REG_RSI, # Argument 2 / volatile
|
||||
3: REG_RDX, # Argument 3 / volatile
|
||||
4: REG_RCX, # Argument 4 / volatile
|
||||
5: REG_R8, # Argument 5 / volatile
|
||||
6: REG_R9, # Argument 6 / volatile
|
||||
7: REG_R10, # volatile
|
||||
8: REG_R11, # volatile
|
||||
9: REG_RBX, # All registers from rbx are non-volatile and need to be saved/restored in epi/prologue
|
||||
10:REG_RBP,
|
||||
11:REG_R12,
|
||||
12:REG_R13,
|
||||
13:REG_R14,
|
||||
14:REG_R15
|
||||
# currently not mapping rsp
|
||||
}
|
||||
|
||||
def gen_prologue_amd64_systemv(self, arg_count, reg_count, mul_fixup="", nested=False):
|
||||
# push volatile registers onto the stack
|
||||
prologue = "\n"
|
||||
if reg_count > self.volatile_registers:
|
||||
for i in range(self.volatile_registers, reg_count):
|
||||
prologue += "push Q%s\n" % i
|
||||
|
||||
# If we are a nested function, we need to align the stack to 16B, and allocate space for up to 4
|
||||
# memory slots not in the redzone. We can use the same logic as on the MSFT x64 side to allocate
|
||||
# our own space for 32B of local variables (whereas on the MSFT side, we use this for allocating
|
||||
# space for a function we are about to call)
|
||||
if nested:
|
||||
allocation_size = calc_amd64_shadow_space_allocation_size(self, reg_count)
|
||||
prologue += "sub rsp, %d // allocate memslot space and align stack\n\n" % allocation_size
|
||||
|
||||
prologue += mul_fixup
|
||||
|
||||
# do not support more than 6 (MAX_FUNCTION_ARGUMENT_COUNT) arguments for now
|
||||
# # put additional arguments into Q7-Qn
|
||||
# # stack_offset to get the 7th argument is:
|
||||
# # 8B for return address
|
||||
# stack_offset = 8
|
||||
# for i in range(self.argument_registers+1, arg_count+1):
|
||||
# prologue += "mov Q%s, [rsp + %d]\n" % (i, stack_offset)
|
||||
# stack_offset += 8
|
||||
|
||||
return prologue
|
||||
|
||||
def gen_prologue_amd64_systemv_mul(self, arg_count, reg_count):
|
||||
return gen_prologue_amd64_systemv(self, arg_count, reg_count, "mov Q3, QH\n")
|
||||
|
||||
def gen_prologue_amd64_systemv_nested(self, arg_count, reg_count):
|
||||
return gen_prologue_amd64_systemv(self, arg_count, reg_count, "", nested=True)
|
||||
|
||||
def gen_epilogue_amd64_systemv(self, arg_count, reg_count, nested=False):
|
||||
epilogue = ""
|
||||
|
||||
if nested:
|
||||
allocation_size = calc_amd64_shadow_space_allocation_size(self, reg_count)
|
||||
epilogue += "add rsp, %d // deallocate memslot space and align stack\n\n" % allocation_size
|
||||
|
||||
if reg_count > self.volatile_registers:
|
||||
for i in reversed(range(self.volatile_registers, reg_count)):
|
||||
epilogue += "pop Q%s\n" % i
|
||||
epilogue += "ret\n"
|
||||
return epilogue
|
||||
|
||||
def gen_epilogue_amd64_systemv_nested(self, arg_count, reg_count):
|
||||
return gen_epilogue_amd64_systemv(self, arg_count, reg_count, nested=True)
|
||||
|
||||
def gen_get_memslot_offset_amd64_systemv(self, slot, arg_count, reg_count, nested=False):
|
||||
# only support 4 memory slots for now
|
||||
if(slot >= 4):
|
||||
logging.error("Symcryptasm currently only support 4 memory slots! (requested slot%d)" % slot)
|
||||
exit(1)
|
||||
# For leaf functions, use the top of the redzone below the stack pointer
|
||||
offset = -8 * (slot+1)
|
||||
if nested:
|
||||
# For nested functions, use the 32B of memslot space above the stack pointer created in the prologue
|
||||
offset = 8*slot
|
||||
return "%d /*MEMSLOT%d*/" % (offset, slot)
|
||||
|
||||
def gen_get_memslot_offset_amd64_systemv_nested(self, slot, arg_count, reg_count):
|
||||
return gen_get_memslot_offset_amd64_systemv(self, slot, arg_count, reg_count, nested=True)
|
||||
|
||||
CALLING_CONVENTION_AMD64_SYSTEMV = CallingConvention(
|
||||
"amd64_systemv", "amd64", MAPPING_AMD64_SYSTEMV, 6, 9,
|
||||
gen_prologue_amd64_systemv, gen_epilogue_amd64_systemv, gen_get_memslot_offset_amd64_systemv)
|
||||
CALLING_CONVENTION_AMD64_SYSTEMV_MUL = CallingConvention(
|
||||
"amd64_systemv", "amd64", get_mul_mapping_from_normal_mapping(MAPPING_AMD64_SYSTEMV, 6), 6, 8,
|
||||
gen_prologue_amd64_systemv_mul, gen_epilogue_amd64_systemv, gen_get_memslot_offset_amd64_systemv)
|
||||
CALLING_CONVENTION_AMD64_SYSTEMV_NESTED = CallingConvention(
|
||||
"amd64_systemv", "amd64", MAPPING_AMD64_SYSTEMV, 6, 9,
|
||||
gen_prologue_amd64_systemv_nested, gen_epilogue_amd64_systemv_nested, gen_get_memslot_offset_amd64_systemv_nested)
|
||||
|
||||
|
||||
def gen_function_start_defines(mapping, arg_count, reg_count):
|
||||
defines = ""
|
||||
for (index, reg) in mapping.items():
|
||||
if (index != 'H') and (index >= max(arg_count+1, reg_count)):
|
||||
continue
|
||||
defines += "#define Q%s %s\n" % (index, reg.name64)
|
||||
defines += "#define D%s %s\n" % (index, reg.name32)
|
||||
defines += "#define W%s %s\n" % (index, reg.name16)
|
||||
defines += "#define B%s %s\n" % (index, reg.name8)
|
||||
return defines
|
||||
|
||||
def gen_function_end_defines(mapping, arg_count, reg_count):
|
||||
undefs = ""
|
||||
for (index, _) in mapping.items():
|
||||
if (index != 'H') and (index >= max(arg_count+1, reg_count)):
|
||||
continue
|
||||
undefs += "#undef Q%s\n" % (index)
|
||||
undefs += "#undef D%s\n" % (index)
|
||||
undefs += "#undef W%s\n" % (index)
|
||||
undefs += "#undef B%s\n" % (index)
|
||||
return undefs
|
||||
|
||||
MASM_FRAMELESS_FUNCTION_ENTRY = "LEAF_ENTRY %s, _TEXT\n"
|
||||
MASM_FRAMELESS_FUNCTION_END = "LEAF_END %s, _TEXT\n"
|
||||
MASM_FRAME_FUNCTION_ENTRY = "NESTED_ENTRY %s, _TEXT\n"
|
||||
MASM_FRAME_FUNCTION_END = "NESTED_END %s, _TEXT\n"
|
||||
|
||||
GAS_FUNCTION_ENTRY = "%s: .global %s\n"
|
||||
GAS_FUNCTION_END = ""
|
||||
|
||||
def generate_prologue(assembler, calling_convention, function_name, arg_count, reg_count, nested):
|
||||
function_entry = None
|
||||
if assembler == "masm":
|
||||
# need to identify and mark up frame functions in masm
|
||||
if nested or (reg_count > calling_convention.volatile_registers):
|
||||
function_entry = MASM_FRAME_FUNCTION_ENTRY % (function_name)
|
||||
else:
|
||||
function_entry = MASM_FRAMELESS_FUNCTION_ENTRY % (function_name)
|
||||
elif assembler == "gas":
|
||||
function_entry = GAS_FUNCTION_ENTRY % (function_name, function_name)
|
||||
|
||||
prologue = gen_function_start_defines(calling_convention.mapping, arg_count, reg_count)
|
||||
prologue += "%s" % (function_entry)
|
||||
prologue += calling_convention.gen_prologue_fn(arg_count, reg_count)
|
||||
|
||||
return prologue
|
||||
|
||||
def generate_epilogue(assembler, calling_convention, function_name, arg_count, reg_count, nested):
|
||||
function_end = None
|
||||
if assembler == "masm":
|
||||
# need to identify and mark up frame functions in masm
|
||||
if nested or (reg_count > calling_convention.volatile_registers):
|
||||
function_end = MASM_FRAME_FUNCTION_END % (function_name)
|
||||
else:
|
||||
function_end = MASM_FRAMELESS_FUNCTION_END % (function_name)
|
||||
elif assembler == "gas":
|
||||
function_end = GAS_FUNCTION_END
|
||||
|
||||
epilogue = calling_convention.gen_epilogue_fn(arg_count, reg_count)
|
||||
epilogue += "%s" % (function_end)
|
||||
epilogue += gen_function_end_defines(calling_convention.mapping, arg_count, reg_count)
|
||||
|
||||
return epilogue
|
||||
|
||||
MASM_MACRO_START = "%s MACRO %s\n"
|
||||
MASM_MACRO_END = "ENDM\n"
|
||||
GAS_MACRO_START = ".macro %s %s\n"
|
||||
GAS_MACRO_END = ".endm\n"
|
||||
MASM_ALTERNATE_ENTRY= "ALTERNATE_ENTRY %s\n"
|
||||
GAS_ALTERNATE_ENTRY = "%s: .global %s\n"
|
||||
|
||||
|
||||
FUNCTION_START_PATTERN = re.compile("\s*(NESTED_)?(MUL_)?FUNCTION_START\s*\(\s*([a-zA-Z0-9]+)\s*,\s*([0-9]+)\s*,\s*([0-9]+)\s*\)")
|
||||
FUNCTION_END_PATTERN = re.compile("\s*(NESTED_)?(MUL_)?FUNCTION_END\s*\(\s*([a-zA-Z0-9]+)\s*\)")
|
||||
GET_MEMSLOT_PATTERN = re.compile("GET_MEMSLOT_OFFSET\s*\(\s*slot([0-9]+)\s*\)")
|
||||
ALTERNATE_ENTRY_PATTERN = re.compile("\s*ALTERNATE_ENTRY\s*\(\s*([a-zA-Z0-9]+)\s*\)")
|
||||
MACRO_START_PATTERN = re.compile("\s*MACRO_START\s*\(\s*([A-Z_0-9]+)\s*,([^\)]+)\)")
|
||||
MACRO_END_PATTERN = re.compile("\s*MACRO_END\s*\(\s*\)")
|
||||
|
||||
class ProcessingStateMachine:
    """A class to hold the state when processing a file and handle files line by line.

    The machine is in one of three states: outside any definition, inside a
    FUNCTION_START/FUNCTION_END pair, or inside a MACRO_START/MACRO_END pair.
    Each call to process_line consumes one input line and returns the text to
    write to the output file.
    """

    def __init__(self, assembler, normal_calling_convention, mul_calling_convention, nested_calling_convention):
        # Target assembler syntax to emit ("masm" or "gas")
        self.assembler = assembler
        self.normal_calling_convention = normal_calling_convention
        self.mul_calling_convention = mul_calling_convention
        self.nested_calling_convention = nested_calling_convention

        # State for the function currently being processed (None/0 when not in a function)
        self.function_start_match = None
        self.function_start_line = 0
        self.is_nested_function = None
        self.is_mul_function = None
        self.calling_convention = None
        self.function_name = None
        self.arg_count = None
        self.reg_count = None

        # State for the macro currently being processed (None when not in a macro)
        self.macro_start_match = None
        self.macro_name = None
        self.macro_args = None

    def process_line(self, line, line_num):
        """Dispatch one input line to the handler for the current state and return the output text."""
        if self.function_start_match is None and self.macro_start_match is None:
            return self.process_normal_line(line, line_num)
        elif self.function_start_match is not None:
            return self.process_function_line(line, line_num)
        elif self.macro_start_match is not None:
            return self.process_macro_line(line, line_num)
        else:
            logging.error("Whoops, something is broken with the state machine (failed at line %d)" % line_num)
            exit(1)

    def process_normal_line(self, line, line_num):
        # Not currently in a function or macro; look for the start of either.
        match = FUNCTION_START_PATTERN.match(line)
        if (match):
            return self.process_start_function(match, line, line_num)

        match = MACRO_START_PATTERN.match(line)
        if (match):
            return self.process_start_macro(match, line, line_num)

        # Not starting a function or a macro - pass the line through unchanged
        return line

    def process_start_function(self, match, line, line_num):
        # Entering a new function: record its attributes so the matching
        # FUNCTION_END and any GET_MEMSLOT_OFFSET uses can be validated/expanded.
        self.function_start_match = match
        self.function_start_line = line_num
        self.is_nested_function = (match.group(1) == "NESTED_")
        self.is_mul_function = (match.group(2) == "MUL_")
        self.function_name = match.groups()[-3]
        self.arg_count = int(match.groups()[-2])
        self.reg_count = int(match.groups()[-1])

        if self.is_nested_function and self.is_mul_function:
            logging.error(
                "Too many prefixes for symcryptasm function - currently only 1 of prefix, MUL_ or NESTED_, is supported!\n\t"
                "%s (line %d)"
                % (line, line_num))
            exit(1)
        if self.arg_count > MAX_FUNCTION_ARGUMENT_COUNT:
            logging.error(
                "Too many (%d) arguments for symcryptasm function - currently only %d arguments are supported!\n\t"
                "%s (line %d)"
                % (self.arg_count, MAX_FUNCTION_ARGUMENT_COUNT, match.group(0), line_num))
            exit(1)
        if self.reg_count > MAX_FUNCTION_REGISTER_COUNT:
            logging.error(
                "Too many (%d) registers required for symcryptasm function - only %d registers are supported!\n\t"
                "%s (line %d)"
                % (self.reg_count, MAX_FUNCTION_REGISTER_COUNT, match.group(0), line_num))
            exit(1)
        # Mul functions reserve one register for the multiplication, so they get one fewer
        if self.is_mul_function and self.reg_count > MAX_FUNCTION_REGISTER_COUNT-1:
            logging.error(
                "Too many (%d) registers required for symcryptasm mul function - only %d registers are supported!\n\t"
                "%s (line %d)"
                % (self.reg_count, MAX_FUNCTION_REGISTER_COUNT-1, match.group(0), line_num))
            exit(1)

        logging.info("%d: function start %s, %d, %d" % (line_num, self.function_name, self.arg_count, self.reg_count))

        if self.is_nested_function:
            self.calling_convention = self.nested_calling_convention
        elif self.is_mul_function:
            self.calling_convention = self.mul_calling_convention
        else:
            self.calling_convention = self.normal_calling_convention

        return generate_prologue(self.assembler, self.calling_convention, self.function_name, self.arg_count, self.reg_count, self.is_nested_function)

    def process_start_macro(self, match, line, line_num):
        # Entering a new macro definition
        self.macro_start_match = match
        self.macro_name = match.group(1)
        self.macro_args = [ x.strip() for x in match.group(2).split(",") ]

        logging.info("%d: macro start %s, %s" % (line_num, self.macro_name, self.macro_args))

        if self.assembler == "masm":
            return MASM_MACRO_START % (self.macro_name, match.group(2))
        elif self.assembler == "gas":
            return GAS_MACRO_START % (self.macro_name, match.group(2))

    def process_function_line(self, line, line_num):
        # Currently in a function

        match = ALTERNATE_ENTRY_PATTERN.match(line)
        if (match):
            if self.assembler == "masm":
                return MASM_ALTERNATE_ENTRY % match.group(1)
            elif self.assembler == "gas":
                return GAS_ALTERNATE_ENTRY % (match.group(1), match.group(1))

        match = FUNCTION_END_PATTERN.match(line)
        if (match):
            # Check the end function has same prefix as previous start function
            if (self.is_nested_function ^ (match.group(1) == "NESTED_")) or \
                (self.is_mul_function ^ (match.group(2) == "MUL_")):
                logging.error("Function start and end do not have same MUL_ or NESTED_ prefix!\n\tStart: %s (line %d)\n\tEnd: %s (line %d)"
                    % (self.function_start_match.group(0), self.function_start_line, match.group(0), line_num))
                exit(1)
            # Check the end function pattern has the same label as the previous start function pattern
            if self.function_name != match.groups()[-1]:
                logging.error("Function start label does not match Function end label!\n\tStart: %s (line %d)\n\tEnd: %s (line %d)"
                    % (self.function_name, self.function_start_line, match.groups()[-1], line_num))
                exit(1)

            epilogue = generate_epilogue(self.assembler, self.calling_convention, self.function_name, self.arg_count, self.reg_count, self.is_nested_function)

            logging.info("%d: function end %s" % (line_num, self.function_name))

            # Reset function state - back to the "normal" state
            self.function_start_match = None
            self.function_start_line = 0
            self.is_nested_function = None
            self.is_mul_function = None
            self.calling_convention = None
            self.function_name = None
            self.arg_count = None
            self.reg_count = None

            return epilogue

        # Replace any GET_MEMSLOT_OFFSET macros in line, one occurrence at a time.
        # count=1 so that each occurrence gets the offset for its own slot; a single
        # unbounded sub would stamp the first slot's offset over every occurrence.
        match = GET_MEMSLOT_PATTERN.search(line)
        while(match):
            slot = int(match.group(1))
            replacement = self.calling_convention.gen_get_memslot_offset_fn(slot, self.arg_count, self.reg_count)
            line = GET_MEMSLOT_PATTERN.sub(replacement, line, count=1)
            match = GET_MEMSLOT_PATTERN.search(line)

            logging.info("%d: memslot macro %d" % (line_num, slot))

        # Not modifying the line any further
        return line

    def process_macro_line(self, line, line_num):
        # Currently in a macro
        match = MACRO_END_PATTERN.match(line)
        if (match):
            logging.info("%d: macro end %s" % (line_num, self.macro_name))

            # Reset macro state - back to the "normal" state
            self.macro_start_match = None
            self.macro_name = None
            self.macro_args = None

            if self.assembler == "masm":
                return MASM_MACRO_END
            elif self.assembler == "gas":
                return GAS_MACRO_END

        if self.assembler == "gas":
            # In GAS macros we need to escape all of the macro arguments with a backslash
            # in the macro body. Use word boundaries and re.escape so an argument only
            # matches the whole identifier (e.g. arg "x" must not rewrite "xmm0").
            for arg in self.macro_args:
                line = re.sub(r"\b%s\b" % re.escape(arg), r"\\%s" % arg, line)

        # Not modifying the line any further
        return line
||||
def process_file(target, infilename, outfilename):
    """Preprocess the symcryptasm file at infilename into a cppasm file at outfilename.

    target selects the output assembler syntax and calling conventions:
    "masm" (MSFT x64 calling convention) or "gas" (SystemV amd64 calling
    convention). Any other target is rejected with an error.
    """
    if target == "masm":
        assembler = "masm"
        normal_calling_convention = CALLING_CONVENTION_AMD64_MSFT
        mul_calling_convention = CALLING_CONVENTION_AMD64_MSFT_MUL
        nested_calling_convention = CALLING_CONVENTION_AMD64_MSFT_NESTED
    elif target == "gas":
        assembler = "gas"
        normal_calling_convention = CALLING_CONVENTION_AMD64_SYSTEMV
        mul_calling_convention = CALLING_CONVENTION_AMD64_SYSTEMV_MUL
        nested_calling_convention = CALLING_CONVENTION_AMD64_SYSTEMV_NESTED
    else:
        # Fail fast with a clear diagnostic rather than hitting a NameError on the
        # undefined calling-convention variables further down
        logging.error("Unsupported target (%s) - only masm and gas are supported!" % target)
        exit(1)

    # iterate through file line by line in one pass
    file_processing_state = ProcessingStateMachine(
        assembler, normal_calling_convention, mul_calling_convention, nested_calling_convention)

    with open(infilename) as infile:
        with open(outfilename, "w") as outfile:
            # 1-based line numbers so "line %d" diagnostics match editor line numbers
            for line_num, line in enumerate(infile, 1):
                processed_line = file_processing_state.process_line(line, line_num)
                outfile.write(processed_line)
|
||||
|
||||
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Preprocess symcryptasm into files that will be further processed with C preprocessor to generate MASM or GAS")
    # choices makes argparse reject unknown targets with a usage message up front
    parser.add_argument('target', type=str, choices=['masm', 'gas'], help='Target that we want to preprocess for')
    parser.add_argument('inputfile', type=str, help='Path to input file')
    parser.add_argument('outputfile', type=str, help='Path to output file')

    args = parser.parse_args()
    process_file(args.target, args.inputfile, args.outputfile)
|
|
@ -7,7 +7,6 @@
|
|||
|
||||
#include "precomp.h"
|
||||
|
||||
#define EQU =
|
||||
#include "C_asm_shared.inc"
|
||||
|
||||
VOID
|
||||
|
|
Загрузка…
Ссылка в новой задаче