Merged PR 5854070: Introduce symcryptasm format to enable use of asm in Windows and Linux

+ Introduce a 2-stage pre-processing setup to convert .symcryptasm to either masm (msft x64
calling convention) or gas (SystemV amd64 calling convention) - see the illustrative sketch below
  + Step 1 converts .symcryptasm to .cppasm (using `scripts/symcryptasm_processor.py`)
  + Step 2 converts .cppasm to .asm using the C preprocessor
+ Updated CMakeLists.txt to invoke this preprocessing when any relevant file is updated
+ Also introduced makefile.inc for the razzle build
+ I have translated all of the amd64 asm files we want to preserve, and the performance for
big-integer-reliant code is the same on Windows and Linux (and a bit better on Windows than before :))
+ In translation I did some tidying of the underlying assembly:
  + Removing needless work (some size-specific functions in particular had cruft from their
adaptation from the generic-sized versions)
  + Reducing code size (e.g. by using inc/dec rather than add/sub 1)
  + Some micro-optimizations to remove needless instruction dependencies
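
As a sketch of the format (using a hypothetical function for illustration, not one from this change;
the argument-register mappings below are the standard ones for each ABI), a .symcryptasm function such as

    FUNCTION_START(SymCryptExampleAdd, 2, 3)
        mov Q0, [Q1]
        add Q0, [Q2]
    FUNCTION_END(SymCryptExampleAdd)

declares 2 arguments and 3 symcryptasm registers in use; the processor maps Q1/Q2 to rcx/rdx when
emitting masm (msft x64) and to rdi/rsi when emitting gas (SystemV), while Q0 is always rax, so a
return value in Q0 needs no per-convention handling.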

Related work items: #30621935
Samuel Lee 2021-04-23 11:33:23 +00:00
Parent 27765f9929
Commit 77d1e446e4
36 changed files with 6038 additions and 8131 deletions

View File

@ -28,9 +28,8 @@ set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib/${CMAKE_SYSTEM_PROCES
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/exe/${CMAKE_SYSTEM_PROCESSOR}/${SYMCRYPT_TARGET_ENV})
if(WIN32 AND SYMCRYPT_TARGET_ENV MATCHES "WindowsUserMode")
# Set DBG=1 and enable ASM_MASM. Annoyingly, this has to be done in the main CMake file rather than in
# the toolchain file
add_compile_options(-DDBG=1)
# Enable ASM_MASM. Annoyingly, this has to be done in the main CMake file rather than in the
# toolchain file
enable_language(ASM_MASM)
add_compile_options(/MP)
# Remove /RTC1, incompatible with /Ox
@ -43,16 +42,23 @@ if(WIN32 AND SYMCRYPT_TARGET_ENV MATCHES "WindowsUserMode")
string( REPLACE "/Od" "" CMAKE_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG})
string( REPLACE "/Od" "" CMAKE_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE})
string( REPLACE "/Od" "" CMAKE_C_FLAGS_RELEASE ${CMAKE_C_FLAGS_RELEASE})
IF(CMAKE_BUILD_TYPE MATCHES Release)
message("Release mode")
if(CMAKE_BUILD_TYPE MATCHES Release)
add_compile_options(/Oxs)
ENDIF(CMAKE_BUILD_TYPE MATCHES Release)
endif()
elseif(NOT WIN32)
enable_language(ASM)
add_compile_options(-Wno-deprecated-declarations -Wno-deprecated)
add_compile_options(-g)
add_compile_options(-Wno-multichar)
add_compile_options(-fPIC)
endif()
if(CMAKE_BUILD_TYPE MATCHES Release)
message("Release mode")
else()
message("Debug mode")
add_compile_options(-DDBG=1)
endif()
include_directories(inc)

View File

@ -1,4 +1,4 @@
# Introduction
SymCrypt is the core cryptographic function library currently used by Windows.
## History
@ -30,20 +30,23 @@ or gcc 7.4.0 on Linux. Note that CMake ships with Visual Studio 2019.
4. Configure CMake compilation:
* For 32-bit Windows targets: `cmake .. -DCMAKE_TOOLCHAIN_FILE=../cmake-toolchain/windows-x86.cmake -A Win32`
* For 64-bit Windows targets: `cmake .. -DCMAKE_TOOLCHAIN_FILE=../cmake-toolchain/windows-amd64.cmake`
* For Linux (or Windows with no CPU optimizations): `cmake ..`
* For 64-bit Linux targets: `cmake .. -DCMAKE_TOOLCHAIN_FILE=../cmake-toolchain/linux-amd64.cmake`
* For no CPU optimizations: `cmake ..`
* Optionally, for a release build, specify `-DCMAKE_BUILD_TYPE=Release`
5. `cmake --build .`
* Optionally specify -jN where N is the number of processes you wish to spawn for the build
If compilation succeeds, the output will be put in the `exe` subdirectory relative to where compilation occurred
(i.e. `bin/exe` if you followed the instructions above).
The SymCrypt unit test is in the `unittest` directory. It runs extensive functional tests on the SymCrypt
library. On Windows it also compares results against other implementations such as the Windows APIs CNG
and CAPI, and the older crypto libraries rsa32 and msbignum, if they are available. It also provides
detailed performance information.
# Security Bugs
If you believe you have found a problem that affects the security of this code, please do **NOT** create an issue
or pull request, but instead email your comments to secure@microsoft.com.
# Contribute
We love to receive comments and suggestions. Unfortunately we cannot accept external code contributions at this time.

View File

@ -10,7 +10,6 @@ set(SYMCRYPT_TARGET_ENV Linux)
# Define _AMD64_ to set up the correct SymCrypt macros, e.g. SYMCRYPT_CPU_AMD64
add_compile_options(-D_AMD64_)
add_compile_options(-DDBG)
add_compile_options(-O3)
# Enable a baseline of features for the compiler to support everywhere

View File

@ -1,70 +1,43 @@
;/*
; C_asm_shared.inc file to synchronize C and Asm information
; Copyright (c) Microsoft Corporation. Licensed under the MIT license.
/*
C_asm_shared.inc file to synchronize C and Asm information
Copyright (c) Microsoft Corporation. Licensed under the MIT license.
; This is a file that compiles both in C and ASM to define values in a way that is guaranteed to be the same on both sides.
; We use this to define the structure offsets that the ASM code uses.
; By having equivalent C constants we can add checks to the C code to ensure they are correct.
;
; This is an ugly hack, but it works :-)
;
; Due to the fact that the ARM assemblers use the C precompiler
; the C files have to redefine EQU to nothing before including this file.
; */
This is a file that is included in both C and ASM such that the values are the same on both sides.
We use the C preprocessor to set ASM constants, as we already need to use the C preprocessor for
symcryptasm processing (see scripts/symcryptasm_processor.py).
We use this to define the structure offsets that the ASM code uses.
By having equivalent C constants we can add checks to the C code to ensure they are correct.
*/
;const SIZE_T
SymCryptModulusNdigitsOffsetAmd64 EQU 4;
#if defined(SYMCRYPT_MASM)
#define SET(_variable, _value) _variable EQU _value
#elif defined(SYMCRYPT_GAS)
#define SET(_variable, _value) .set _variable, _value
#else // assume C
#define SET(_variable, _value) const SIZE_T _variable = _value;
#endif
; const SIZE_T
SymCryptModulusMontgomeryInv64OffsetAmd64 EQU 32;
SET(SymCryptModulusNdigitsOffsetAmd64, 4);
SET(SymCryptModulusMontgomeryInv64OffsetAmd64, 32);
SET(SymCryptModulusValueOffsetAmd64, 128);
; const SIZE_T
SymCryptModulusValueOffsetAmd64 EQU 128;
SET(SymCryptModulusNdigitsOffsetX86, 4);
SET(SymCryptModulusMontgomeryInv64OffsetX86, 24);
SET(SymCryptModulusValueOffsetX86, 96);
SET(SymCryptModulusNdigitsOffsetArm64, 4);
SET(SymCryptModulusMontgomeryInv64OffsetArm64, 32);
SET(SymCryptModulusValueOffsetArm64, 128);
SET(SymCryptModulusNdigitsOffsetArm, 4);
SET(SymCryptModulusMontgomeryInv64OffsetArm, 24);
SET(SymCryptModulusValueOffsetArm, 96);
;const SIZE_T
SymCryptModulusNdigitsOffsetX86 EQU 4;
; const SIZE_T
SymCryptModulusMontgomeryInv64OffsetX86 EQU 24;
; const SIZE_T
SymCryptModulusValueOffsetX86 EQU 96;
;const SIZE_T
SymCryptModulusNdigitsOffsetArm64 EQU 4;
; const SIZE_T
SymCryptModulusMontgomeryInv64OffsetArm64 EQU 32;
; const SIZE_T
SymCryptModulusValueOffsetArm64 EQU 128;
;const SIZE_T
SymCryptModulusNdigitsOffsetArm EQU 4;
; const SIZE_T
SymCryptModulusMontgomeryInv64OffsetArm EQU 24;
; const SIZE_T
SymCryptModulusValueOffsetArm EQU 96;
; /*
IF 0
; */
#undef EQU
#if !defined(SYMCRYPT_MASM) && !defined(SYMCRYPT_GAS)
// Preserve the definition of SET for use in symcryptasm processing
#undef SET
#endif
#if SYMCRYPT_CPU_AMD64
#define SYMCRYPT_CHECK_ASM_OFFSETS \
@ -89,14 +62,9 @@ SymCryptModulusValueOffsetArm EQU 96;
SYMCRYPT_CHECK_ASM_OFFSET( SymCryptModulusNdigitsOffsetArm, SYMCRYPT_FIELD_OFFSET( SYMCRYPT_MODULUS, nDigits ) );\
SYMCRYPT_CHECK_ASM_OFFSET( SymCryptModulusMontgomeryInv64OffsetArm, SYMCRYPT_FIELD_OFFSET( SYMCRYPT_MODULUS, tm.montgomery.inv64 ));\
SYMCRYPT_CHECK_ASM_OFFSET( SymCryptModulusValueOffsetArm, SYMCRYPT_FIELD_OFFSET( SYMCRYPT_MODULUS, Divisor.Int.ti.fdef.uint32 ));\
#endif // CPU_*
#if !defined( SYMCRYPT_CHECK_ASM_OFFSETS)
#define SYMCRYPT_CHECK_ASM_OFFSETS
#endif
; /*
ENDIF
; */
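
To make the shared-constant mechanism above concrete: for the first constant, the SET macro expands to

    SymCryptModulusNdigitsOffsetAmd64 EQU 4                 (masm, SYMCRYPT_MASM defined)
    .set SymCryptModulusNdigitsOffsetAmd64, 4               (gas, SYMCRYPT_GAS defined)
    const SIZE_T SymCryptModulusNdigitsOffsetAmd64 = 4;     (C, neither defined)

so a single source line keeps both assemblers and the C-side SYMCRYPT_CHECK_ASM_OFFSET checks in agreement.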

View File

@ -96,42 +96,148 @@ set(SOURCES_COMMON
IEEE802_11SaeCustom.c
)
function(process_cppasm filepath outformat archdefine)
get_filename_component(fileextension ${filepath} EXT)
if(NOT fileextension STREQUAL .cppasm)
message(FATAL_ERROR "cppasm processing invoked on file with incorrect extension (${filepath} -> ${fileextension})")
endif()
if((NOT outformat STREQUAL gas) AND (NOT outformat STREQUAL masm))
message(FATAL_ERROR "cppasm processing invoked with unrecognized outformat (${outformat})")
endif()
if((NOT archdefine STREQUAL amd64) AND (NOT archdefine STREQUAL x86))
message(FATAL_ERROR "cppasm processing invoked with unrecognized archdefine (${archdefine})")
endif()
get_filename_component(rootpath ${filepath} DIRECTORY)
get_filename_component(filestem ${filepath} NAME_WE) # filestem is the filename w/out extension
string(TOUPPER ${outformat} outformatupper)
string(TOUPPER ${archdefine} archdefineupper)
string(FIND ${rootpath} ${CMAKE_CURRENT_BINARY_DIR} findindex) # check whether input is in the output directory
if(findindex EQUAL -1) # input in the source directory
set(filepath ${CMAKE_CURRENT_SOURCE_DIR}/${filepath})
set(output_pass2 ${CMAKE_CURRENT_BINARY_DIR}/${rootpath}/${filestem}-${outformat}.asm)
else() # input in the output directory
set(output_directory ${rootpath})
set(output_pass2 ${rootpath}/${filestem}.asm)
endif()
set(dbg_definition "")
if(CMAKE_BUILD_TYPE MATCHES Debug)
set(dbg_definition "-DDBG=1")
endif()
if(outformat STREQUAL gas)
# assume gas => GCC compatible C compiler
add_custom_command(
OUTPUT ${output_pass2}
COMMAND "${CMAKE_C_COMPILER}" -E -P -x c ${filepath} -o ${output_pass2}
-I${CMAKE_CURRENT_SOURCE_DIR} -I${CMAKE_CURRENT_SOURCE_DIR}/${rootpath} -I${CMAKE_SOURCE_DIR}/inc
-DSYMCRYPT_${outformatupper} -DSYMCRYPT_CPU_${archdefineupper} ${dbg_definition}
MAIN_DEPENDENCY ${filepath}
DEPENDS ${CMAKE_SOURCE_DIR}/inc/C_asm_shared.inc ${filepath} symcryptasm_shared.cppasm
COMMENT "C preprocessing ${filepath} to ${outformat} (${output_pass2})"
VERBATIM)
elseif(outformat STREQUAL masm)
# assume masm => MSVC C compiler
add_custom_command(
OUTPUT ${output_pass2}
COMMAND "${CMAKE_C_COMPILER}" /EP /P /Fi${output_pass2} ${filepath}
-I${CMAKE_CURRENT_SOURCE_DIR} -I${CMAKE_CURRENT_SOURCE_DIR}/${rootpath} -I${CMAKE_SOURCE_DIR}/inc
-DSYMCRYPT_${outformatupper} -DSYMCRYPT_CPU_${archdefineupper} ${dbg_definition}
MAIN_DEPENDENCY ${filepath}
DEPENDS ${CMAKE_SOURCE_DIR}/inc/C_asm_shared.inc ${filepath} symcryptasm_shared.cppasm
COMMENT "C preprocessing ${filepath} to ${outformat} (${output_pass2})"
VERBATIM)
endif()
endfunction()
function(process_symcryptasm filepath outformat archdefine)
get_filename_component(fileextension ${filepath} EXT)
if(NOT fileextension STREQUAL .symcryptasm)
message(FATAL_ERROR "symcryptasm processing invoked on file with incorrect extension (${filepath} -> ${fileextension})")
endif()
if((NOT outformat STREQUAL gas) AND (NOT outformat STREQUAL masm))
message(FATAL_ERROR "symcryptasm processing invoked with unrecognized outformat (${outformat})")
endif()
get_filename_component(rootpath ${filepath} DIRECTORY)
get_filename_component(filestem ${filepath} NAME_WE) # filestem is the filename w/out extension
set(filepath ${CMAKE_CURRENT_SOURCE_DIR}/${filepath})
set(output_directory ${CMAKE_CURRENT_BINARY_DIR}/${rootpath})
set(output_cppasm ${output_directory}/${filestem}-${outformat}.cppasm)
add_custom_command(
OUTPUT ${output_cppasm}
COMMAND ${CMAKE_COMMAND} -E make_directory ${output_directory}
COMMAND python3 ${CMAKE_SOURCE_DIR}/scripts/symcryptasm_processor.py ${outformat} ${filepath} ${output_cppasm}
MAIN_DEPENDENCY ${filepath}
DEPENDS ${CMAKE_SOURCE_DIR}/scripts/symcryptasm_processor.py
COMMENT "Python preprocessing ${filepath} to ${outformat} (${output_cppasm})"
VERBATIM)
process_cppasm(${output_cppasm} ${outformat} ${archdefine})
endfunction()
if(NOT WIN32)
list(APPEND SOURCES_COMMON linux/intrinsics.c)
list(APPEND SOURCES_COMMON linux/asmstubs.c)
endif()
if(WIN32 AND NOT(SYMCRYPT_TARGET_ENV MATCHES "Generic"))
if(CMAKE_SYSTEM_PROCESSOR MATCHES "AMD64")
process_symcryptasm(amd64/aesasm.symcryptasm masm amd64)
process_symcryptasm(amd64/fdef_asm.symcryptasm masm amd64)
process_symcryptasm(amd64/fdef369_asm.symcryptasm masm amd64)
process_symcryptasm(amd64/fdef_mulx.symcryptasm masm amd64)
process_symcryptasm(amd64/wipe.symcryptasm masm amd64)
list(APPEND SOURCES_COMMON
amd64/aesasm.asm
amd64/fdef_asm.asm
amd64/fdef_mulx.asm
amd64/fdef369_asm.asm
amd64/sha1asm.asm
amd64/wipe.asm)
amd64/aesasm-masm.asm
amd64/fdef_asm-masm.asm
amd64/fdef369_asm-masm.asm
amd64/fdef_mulx-masm.asm
amd64/wipe-masm.asm)
set_source_files_properties(
amd64/aesasm.asm
amd64/fdef_asm.asm
amd64/fdef_mulx.asm
amd64/fdef369_asm.asm
amd64/sha1asm.asm
amd64/wipe.asm
amd64/aesasm-masm.asm
amd64/fdef_asm-masm.asm
amd64/fdef369_asm-masm.asm
amd64/fdef_mulx-masm.asm
amd64/wipe-masm.asm
PROPERTY LANGUAGE ASM_MASM)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "X86")
process_cppasm(i386/fdef_asm.cppasm masm x86)
list(APPEND SOURCES_COMMON
i386/aesasm.asm
i386/fdef_asm.asm
i386/rc4asm.asm
i386/sha1asm.asm
i386/fdef_asm-masm.asm
i386/wipe.asm)
set_source_files_properties(
i386/aesasm.asm
i386/fdef_asm.asm
i386/rc4asm.asm
i386/sha1asm.asm
i386/fdef_asm-masm.asm
i386/wipe.asm
PROPERTY LANGUAGE ASM_MASM)
set_source_files_properties(
i386/fdef_asm-masm.asm PROPERTIES INCLUDE_DIRECTORIES ${CMAKE_CURRENT_SOURCE_DIR}/i386)
endif()
else()
if(CMAKE_SYSTEM_PROCESSOR MATCHES "AMD64")
process_symcryptasm(amd64/aesasm.symcryptasm gas amd64)
process_symcryptasm(amd64/fdef_asm.symcryptasm gas amd64)
process_symcryptasm(amd64/fdef369_asm.symcryptasm gas amd64)
process_symcryptasm(amd64/fdef_mulx.symcryptasm gas amd64)
process_symcryptasm(amd64/wipe.symcryptasm gas amd64)
list(APPEND SOURCES_COMMON
amd64/aesasm-gas.asm
amd64/fdef_asm-gas.asm
amd64/fdef369_asm-gas.asm
amd64/fdef_mulx-gas.asm
amd64/wipe-gas.asm)
set_source_files_properties(
amd64/aesasm-gas.asm
amd64/fdef_asm-gas.asm
amd64/fdef369_asm-gas.asm
amd64/fdef_mulx-gas.asm
amd64/wipe-gas.asm
PROPERTY LANGUAGE ASM)
endif()
endif()
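
For example, for a gas target the two stages chained by process_symcryptasm above amount to commands
along these lines (paths abbreviated here; the real include directories and the Debug-only -DDBG=1 are
as computed in process_cppasm):

    python3 scripts/symcryptasm_processor.py gas amd64/fdef_asm.symcryptasm fdef_asm-gas.cppasm
    cc -E -P -x c fdef_asm-gas.cppasm -o fdef_asm-gas.asm -DSYMCRYPT_GAS -DSYMCRYPT_CPU_AMD64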

View File

@ -22,7 +22,7 @@ const SYMCRYPT_MODULAR_FUNCTIONS g_SymCryptModFns[] = {
SYMCRYPT_MOD_FUNCTIONS_FDEF_GENERIC, // Handles any type of modulus
SYMCRYPT_MOD_FUNCTIONS_FDEF_MONTGOMERY, // Montgomery, only for odd parity-public moduli
#if SYMCRYPT_CPU_AMD64 && SYMCRYPT_MS_VC
#if SYMCRYPT_CPU_AMD64
SYMCRYPT_MOD_FUNCTIONS_FDEF369_MONTGOMERY, // optimized for 384 and 576-bit moduli
SYMCRYPT_MOD_FUNCTIONS_FDEF_MONTGOMERY256, // Special faster code for 256-bit Montgomery moduli
@ -55,12 +55,12 @@ const UINT32 g_SymCryptModFnsMask = sizeof( g_SymCryptModFns ) - sizeof( g_SymCr
//
// Tweaking the selection & function tables allows different tradeoffs of performance vs codesize
//
SYMCRYPT_MODULUS_TYPE_SELECTION_ENTRY SymCryptModulusTypeSelections[] =
{
#if SYMCRYPT_CPU_AMD64 && SYMCRYPT_MS_VC
#if SYMCRYPT_CPU_AMD64
// Mulx used for 257-512 and 577-... bits
{('2M' << 16) + SymCryptModFntableMontgomery256, 0, 256, SYMCRYPT_MODULUS_FEATURE_MONTGOMERY },
{('xM' << 16) + SymCryptModFntableMontgomeryMulx, SYMCRYPT_CPU_FEATURES_FOR_MULX, 512, SYMCRYPT_MODULUS_FEATURE_MONTGOMERY },
{('9M' << 16) + SymCryptModFntable369Montgomery, 0, 384, SYMCRYPT_MODULUS_FEATURE_MONTGOMERY },
{('5M' << 16) + SymCryptModFntableMontgomery512, 0, 512, SYMCRYPT_MODULUS_FEATURE_MONTGOMERY },
{('9M' << 16) + SymCryptModFntable369Montgomery, 0, 576, SYMCRYPT_MODULUS_FEATURE_MONTGOMERY },
@ -118,9 +118,9 @@ SymCryptSizeofIntFromDigits( UINT32 nDigits )
PSYMCRYPT_INT
SYMCRYPT_CALL
SymCryptIntCreate(
_Out_writes_bytes_( cbBuffer ) PBYTE pbBuffer,
SIZE_T cbBuffer,
UINT32 nDigits )
{
return SymCryptFdefIntCreate( pbBuffer, cbBuffer, nDigits );
@ -138,8 +138,8 @@ SymCryptIntWipe( _Out_ PSYMCRYPT_INT piDst )
VOID
SYMCRYPT_CALL
SymCryptIntCopy(
_In_ PCSYMCRYPT_INT piSrc,
_Out_ PSYMCRYPT_INT piDst )
{
SymCryptFdefIntCopy( piSrc, piDst );
@ -191,8 +191,8 @@ SymCryptIntDigitsizeOfObject( _In_ PCSYMCRYPT_INT piSrc )
SYMCRYPT_ERROR
SYMCRYPT_CALL
SymCryptIntCopyMixedSize(
_In_ PCSYMCRYPT_INT piSrc,
_Out_ PSYMCRYPT_INT piDst )
{
return SymCryptFdefIntCopyMixedSize( piSrc, piDst );
@ -207,8 +207,8 @@ SymCryptIntBitsizeOfValue( _In_ PCSYMCRYPT_INT piSrc )
VOID
SYMCRYPT_CALL
SymCryptIntSetValueUint32(
UINT32 u32Src,
_Out_ PSYMCRYPT_INT piDst )
{
SymCryptFdefIntSetValueUint32( u32Src, piDst );
@ -216,8 +216,8 @@ SymCryptIntSetValueUint32(
VOID
SYMCRYPT_CALL
SymCryptIntSetValueUint64(
UINT64 u64Src,
_Out_ PSYMCRYPT_INT piDst )
{
SymCryptFdefIntSetValueUint64( u64Src, piDst );
@ -225,10 +225,10 @@ SymCryptIntSetValueUint64(
SYMCRYPT_ERROR
SYMCRYPT_CALL
SymCryptIntSetValue(
_In_reads_bytes_(cbSrc) PCBYTE pbSrc,
SIZE_T cbSrc,
SYMCRYPT_NUMBER_FORMAT format,
_Out_ PSYMCRYPT_INT piDst )
{
return SymCryptFdefIntSetValue( pbSrc, cbSrc, format, piDst );
@ -236,10 +236,10 @@ SymCryptIntSetValue(
SYMCRYPT_ERROR
SYMCRYPT_CALL
SymCryptIntGetValue(
_In_ PCSYMCRYPT_INT piSrc,
_Out_writes_bytes_( cbDst ) PBYTE pbDst,
SIZE_T cbDst,
SYMCRYPT_NUMBER_FORMAT format )
{
return SymCryptFdefIntGetValue( piSrc, pbDst, cbDst, format );
@ -496,9 +496,9 @@ SymCryptSizeofDivisorFromDigits( UINT32 nDigits )
PSYMCRYPT_DIVISOR
SYMCRYPT_CALL
SymCryptDivisorCreate(
_Out_writes_bytes_( cbBuffer ) PBYTE pbBuffer,
SIZE_T cbBuffer,
UINT32 nDigits )
{
return SymCryptFdefDivisorCreate( pbBuffer, cbBuffer, nDigits );
@ -514,8 +514,8 @@ SymCryptDivisorWipe( _Out_ PSYMCRYPT_DIVISOR pdObj )
}
VOID
SymCryptDivisorCopy(
_In_ PCSYMCRYPT_DIVISOR pdSrc,
_Out_ PSYMCRYPT_DIVISOR pdDst )
{
SymCryptFdefDivisorCopy( pdSrc, pdDst );
@ -585,9 +585,9 @@ SymCryptSizeofModulusFromDigits( UINT32 nDigits )
PSYMCRYPT_MODULUS
SYMCRYPT_CALL
SymCryptModulusCreate(
_Out_writes_bytes_( cbBuffer ) PBYTE pbBuffer,
SIZE_T cbBuffer,
UINT32 nDigits )
{
return SymCryptFdefModulusCreate( pbBuffer, cbBuffer, nDigits );
@ -604,7 +604,7 @@ SymCryptModulusWipe( _Out_ PSYMCRYPT_MODULUS pmObj )
VOID
SymCryptModulusCopy(
_In_ PCSYMCRYPT_MODULUS pmSrc,
_Out_ PSYMCRYPT_MODULUS pmDst )
{
SymCryptFdefModulusCopy( pmSrc, pmDst );
@ -626,8 +626,8 @@ SymCryptModElementAllocate( _In_ PCSYMCRYPT_MODULUS pmMod )
VOID
SYMCRYPT_CALL
SymCryptModElementFree(
_In_ PCSYMCRYPT_MODULUS pmMod,
_Out_ PSYMCRYPT_MODELEMENT peObj )
{
SymCryptFdefModElementFree( pmMod, peObj );
@ -642,9 +642,9 @@ SymCryptSizeofModElementFromModulus( PCSYMCRYPT_MODULUS pmMod )
PSYMCRYPT_MODELEMENT
SYMCRYPT_CALL
SymCryptModElementCreate(
_Out_writes_bytes_( cbBuffer ) PBYTE pbBuffer,
SIZE_T cbBuffer,
PCSYMCRYPT_MODULUS pmMod )
{
return SymCryptFdefModElementCreate( pbBuffer, cbBuffer, pmMod );
@ -660,9 +660,9 @@ SymCryptModElementWipe(
}
VOID
SymCryptModElementCopy(
_In_ PCSYMCRYPT_MODULUS pmMod,
_In_ PCSYMCRYPT_MODELEMENT peSrc,
_Out_ PSYMCRYPT_MODELEMENT peDst )
{
SymCryptFdefModElementCopy( pmMod, peSrc, peDst );
@ -671,7 +671,7 @@ SymCryptModElementCopy(
VOID
SymCryptModElementMaskedCopy(
_In_ PCSYMCRYPT_MODULUS pmMod,
_In_ PCSYMCRYPT_MODELEMENT peSrc,
_Out_ PSYMCRYPT_MODELEMENT peDst,
UINT32 mask )
{
@ -753,7 +753,7 @@ SymCryptModElementToInt(
PCUINT32 pData;
SYMCRYPT_ASSERT( piDst->nDigits >= pmMod->nDigits );
pData = SYMCRYPT_MOD_CALL( pmMod ) modPreGet( pmMod, peSrc, pbScratch, cbScratch );
SymCryptFdefModElementToIntGeneric( pmMod, pData, piDst, pbScratch, cbScratch );
@ -762,17 +762,17 @@ SymCryptModElementToInt(
SYMCRYPT_DISABLE_CFG
SYMCRYPT_ERROR
SYMCRYPT_CALL
SymCryptModElementSetValue(
_In_reads_bytes_( cbSrc ) PCBYTE pbSrc,
SIZE_T cbSrc,
SYMCRYPT_NUMBER_FORMAT format,
PCSYMCRYPT_MODULUS pmMod,
_Out_ PSYMCRYPT_MODELEMENT peDst,
_Out_writes_bytes_( cbScratch ) PBYTE pbScratch,
SIZE_T cbScratch )
{
SYMCRYPT_ERROR scError;
scError = SymCryptFdefModElementSetValueGeneric( pbSrc, cbSrc, format, pmMod, peDst, pbScratch, cbScratch );
if( scError == SYMCRYPT_NO_ERROR )
@ -785,11 +785,11 @@ SymCryptModElementSetValue(
SYMCRYPT_ERROR
SYMCRYPT_CALL
SymCryptModElementGetValue(
PCSYMCRYPT_MODULUS pmMod,
_In_ PCSYMCRYPT_MODELEMENT peSrc,
_Out_writes_bytes_( cbDst ) PBYTE pbDst,
SIZE_T cbDst,
SYMCRYPT_NUMBER_FORMAT format,
_Out_writes_bytes_( cbScratch ) PBYTE pbScratch,
SIZE_T cbScratch )
@ -889,8 +889,8 @@ SymCryptModNeg(
SYMCRYPT_DISABLE_CFG
VOID
SYMCRYPT_CALL
SymCryptModElementSetValueUint32(
UINT32 value,
_In_ PCSYMCRYPT_MODULUS pmMod,
_Out_ PSYMCRYPT_MODELEMENT peDst,
_Out_writes_bytes_( cbScratch ) PBYTE pbScratch,
@ -903,8 +903,8 @@ SymCryptModElementSetValueUint32(
VOID
SYMCRYPT_CALL
SymCryptModElementSetValueNegUint32(
UINT32 value,
_In_ PCSYMCRYPT_MODULUS pmMod,
_Out_ PSYMCRYPT_MODELEMENT peDst,
_Out_writes_bytes_( cbScratch ) PBYTE pbScratch,
@ -994,7 +994,7 @@ SymCryptCreateTrialDivisionContext( UINT32 nDigits )
UINT32
SYMCRYPT_CALL
SymCryptIntFindSmallDivisor(
_In_ PCSYMCRYPT_TRIALDIVISION_CONTEXT pContext,
_In_ PCSYMCRYPT_INT piSrc,
_Out_writes_bytes_( cbScratch ) PBYTE pbScratch,

File diff suppressed because it is too large. Load Diff

View File

@ -0,0 +1,964 @@
//
// aesasm.symcryptasm Assembler code for fast AES on the amd64
// Expresses asm in a generic enough way to enable generation of MASM and GAS using the
// symcryptasm_processor.py script and C preprocessor
//
// Copyright (c) Microsoft Corporation. Licensed under the MIT license.
//
// This code is derived from the AesFast implementation that
// Niels Ferguson wrote from scratch for BitLocker during Vista.
// That code is still in RSA32.
//
// This file has only been partially translated into symcryptasm; external function calls use the
// generic symcryptasm registers to convert different calling conventions into the fixed register
// layout used in aesasm. It seems likely that changing which registers the AES state is kept in
// within the macros could impact performance.
// In general we don't want to touch this code going forward; the vast majority of amd64 CPUs have aesni
// and use the Xmm Aes codepaths.
#include "symcryptasm_shared.cppasm"
#include "symcrypt_version.inc"
#define USE_BLOCK_FUNCTION 1 // Set to 1 to use block function, 0 to use block macro
#if defined(SYMCRYPT_MASM)
extern SymCryptAesSboxMatrixMult:DWORD
extern SymCryptAesInvSboxMatrixMult:DWORD
extern SymCryptAesInvSbox:BYTE
extern SymCryptFatal:NEAR
#elif defined(SYMCRYPT_GAS)
#else
#error Unknown target assembly
#endif
#if DBG
SET(SYMCRYPT_CODE_VERSION, ((SYMCRYPT_CODE_VERSION_API SHL 16) OR SYMCRYPT_CODE_VERSION_MINOR ))
SET(SYMCRYPT_MAGIC_CONSTANT, (HEX(53316D76) + SYMCRYPT_CODE_VERSION)) // 0x53316D76 == 'S1mv'
MACRO_START(SYMCRYPT_CHECK_MAGIC, check_magic_label, ptr, struct_magic_offset, arg_1)
mov rax, [ptr + struct_magic_offset]
sub rax, ptr
cmp rax, SYMCRYPT_MAGIC_CONSTANT
jz check_magic_label
mov arg_1, HEX(6D616763) // 0x6D616763 == 'magc'
call SymCryptFatal
check_magic_label:
MACRO_END()
#else
MACRO_START(SYMCRYPT_CHECK_MAGIC, check_magic_label, ptr, struct_magic_offset, arg_1)
MACRO_END()
#endif
//
// Structure definition that mirrors the SYMCRYPT_AES_EXPANDED_KEY structure.
//
// SYMCRYPT_AES_EXPANDED_KEY struct
// RoundKey dq 2*N_ROUND_KEYS_IN_AESKEY dup (?) //
// lastEncRoundKey dq ? // pointer to last enc round key
// lastDecRoundKey dq ? // pointer to last dec round key
// SYMCRYPT_MAGIC_FIELD
// SYMCRYPT_AES_EXPANDED_KEY ends
SET(N_ROUND_KEYS_IN_AESKEY, 29)
SET(lastEncRoundKeyOffset, (29*16))
SET(lastDecRoundKeyOffset, (29*16 + 8))
SET(magicFieldOffset, (29*16 + 8 + 8))
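// For reference, these evaluate to lastEncRoundKeyOffset = 464, lastDecRoundKeyOffset = 472, and
// magicFieldOffset = 480: 29 round-key slots of 16 bytes each, followed by the two 8-byte pointers.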
//
// Shorthand for the 4 tables we will use
// We always use r11 to point to the (inv) SboxMatrixMult tables
//
#define SMM0 (r11 + 0)
#define SMM1 (r11 + 1024)
#define SMM2 (r11 + 2048)
#define SMM3 (r11 + 3072)
#define ISMM0 (r11 + 0)
#define ISMM1 (r11 + 1024)
#define ISMM2 (r11 + 2048)
#define ISMM3 (r11 + 3072)
MACRO_START(ENC_MIX, keyptr)
//
// Perform the unkeyed mixing function for encryption
// plus a key addition from the key pointer
//
// input: block is in eax, ebx, ecx, edx - r11 points to AesSboxMatrixMult
// New state ends up in eax, ebx, ecx, edx
// Used registers: esi, edi, ebp, r8
//
// We can use the e<xx> registers for the movzx as the
// upper 32 bits are automatically set to 0. This saves
// prefix bytes
//
// We use 32-bit registers to store the state.
// We tried using 64-bit registers, but the extra shifts
// cost too much.
// Using 32-bit throughout makes the key xor more expensive
// but we avoid having to combine the 32-bit halves into
// 64 bit.
//
movzx esi,al
mov esi,[SMM0 + 4 * rsi]
movzx edi,ah
shr eax,16
mov r8d,[SMM1 + 4 * rdi]
movzx ebp,al
mov ebp,[SMM2 + 4 * rbp]
movzx edi,ah
mov edi,[SMM3 + 4 * rdi]
movzx eax,bl
xor edi,[SMM0 + 4 * rax]
movzx eax,bh
shr ebx,16
xor esi,[SMM1 + 4 * rax]
movzx eax,bl
xor r8d,[SMM2 + 4 * rax]
movzx eax,bh
xor ebp,[SMM3 + 4 * rax]
movzx eax,cl
xor ebp,[SMM0 + 4 * rax]
movzx ebx,ch
shr ecx,16
xor edi,[SMM1 + 4 * rbx]
movzx eax,cl
xor esi,[SMM2 + 4 * rax]
movzx ebx,ch
xor r8d,[SMM3 + 4 * rbx]
movzx eax,dl
xor r8d,[SMM0 + 4 * rax]
movzx ebx,dh
shr edx,16
xor ebp,[SMM1 + 4 * rbx]
movzx eax,dl
xor edi,[SMM2 + 4 * rax]
movzx ebx,dh
xor esi,[SMM3 + 4 * rbx]
mov eax, [keyptr]
mov ebx, [keyptr + 4]
xor eax, esi
mov ecx, [keyptr + 8]
xor ebx, edi
mov edx, [keyptr + 12]
xor ecx, ebp
xor edx, r8d
MACRO_END()
MACRO_START(DEC_MIX, keyptr)
//
// Perform the unkeyed mixing function for decryption
//
// input: block is in eax, ebx, ecx, edx
// r11 points to AesInvSboxMatrixMult
// New state ends up in esi, edi, ebp, r8d
movzx esi,al
mov esi,[ISMM0 + 4 * rsi]
movzx edi,ah
shr eax,16
mov edi,[ISMM1 + 4 * rdi]
movzx ebp,al
mov ebp,[ISMM2 + 4 * rbp]
movzx eax,ah
mov r8d,[ISMM3 + 4 * rax]
movzx eax,bl
xor edi,[ISMM0 + 4 * rax]
movzx eax,bh
shr ebx,16
xor ebp,[ISMM1 + 4 * rax]
movzx eax,bl
xor r8d,[ISMM2 + 4 * rax]
movzx eax,bh
xor esi,[ISMM3 + 4 * rax]
movzx eax,cl
xor ebp,[ISMM0 + 4 * rax]
movzx ebx,ch
shr ecx,16
xor r8d,[ISMM1 + 4 * rbx]
movzx eax,cl
xor esi,[ISMM2 + 4 * rax]
movzx ebx,ch
xor edi,[ISMM3 + 4 * rbx]
movzx eax,dl
xor r8d,[ISMM0 + 4 * rax]
movzx ebx,dh
shr edx,16
xor esi,[ISMM1 + 4 * rbx]
movzx eax,dl
xor edi,[ISMM2 + 4 * rax]
movzx ebx,dh
xor ebp,[ISMM3 + 4 * rbx]
mov eax, [keyptr]
mov ebx, [keyptr + 4]
xor eax, esi
mov ecx, [keyptr + 8]
xor ebx, edi
mov edx, [keyptr + 12]
xor ecx, ebp
xor edx, r8d
MACRO_END()
MACRO_START(AES_ENCRYPT_MACRO, AesEncryptMacroLoopLabel)
//
// Plaintext in eax, ebx, ecx, edx
// r9 points to first round key to use (modified)
// r10 is last key to use (unchanged)
// r11 points to SboxMatrixMult (unchanged)
// Ciphertext ends up in esi, edi, ebp, r8d
//
// This macro is free to unroll the cipher completely, or to use a loop
// over r9
//
//
// xor in first round key
//
xor eax,[r9]
xor ebx,[r9+4]
xor ecx,[r9+8]
xor edx,[r9+12]
add r9,32
// Do not unroll the loop at all because very few CPUs use this codepath so it's worth
// minimizing the binary size
AesEncryptMacroLoopLabel:
// Block is eax, ebx, ecx, edx
// r9-16 points to next round key
ENC_MIX r9-16
cmp r9,r10
lea r9,[r9+16]
jc AesEncryptMacroLoopLabel
//
// Now for the final round
// We use the fact that SboxMatrixMult[0] table is also
// an Sbox table if you use the second element of each entry.
//
// Result is in esi, edi, ebp, r8d
//
movzx esi,al
movzx esi,byte ptr[r11 + 1 + 4*rsi]
movzx edi,ah
shr eax,16
movzx r8d,byte ptr[r11 + 1 + 4*rdi]
movzx ebp,al
shl r8d,8
movzx ebp,byte ptr[r11 + 1 + 4*rbp]
shl ebp,16
movzx edi,ah
movzx edi,byte ptr[r11 + 1 + 4*rdi]
shl edi,24
movzx eax,bl
movzx eax,byte ptr[r11 + 1 + 4*rax]
or edi,eax
movzx eax,bh
shr ebx,16
movzx eax,byte ptr[r11 + 1 + 4*rax]
shl eax,8
or esi,eax
movzx eax,bl
movzx eax,byte ptr[r11 + 1 + 4*rax]
movzx ebx,bh
shl eax,16
movzx ebx,byte ptr[r11 + 1 + 4*rbx]
or r8d,eax
shl ebx,24
or ebp,ebx
movzx eax,cl
movzx ebx,ch
movzx eax,byte ptr[r11 + 1 + 4*rax]
shr ecx,16
movzx ebx,byte ptr[r11 + 1 + 4*rbx]
shl ebx,8
or ebp,eax
or edi,ebx
movzx eax,cl
movzx eax,byte ptr[r11 + 1 + 4*rax]
movzx ebx,ch
movzx ebx,byte ptr[r11 + 1 + 4*rbx]
shl eax,16
shl ebx,24
or esi,eax
or r8d,ebx
movzx eax,dl
movzx ebx,dh
movzx eax,byte ptr[r11 + 1 + 4*rax]
shr edx,16
movzx ebx,byte ptr[r11 + 1 + 4*rbx]
shl ebx,8
or r8d,eax
or ebp,ebx
movzx eax,dl
movzx eax,byte ptr[r11 + 1 + 4*rax]
movzx ebx,dh
movzx ebx,byte ptr[r11 + 1 + 4*rbx]
shl eax,16
shl ebx,24
or edi,eax
or esi,ebx
//
// xor in final round key
//
xor r8d,[r10+12]
xor esi,[r10]
xor edi,[r10+4]
xor ebp,[r10+8]
MACRO_END()
MACRO_START(AES_DECRYPT_MACRO, AesDecryptMacroLoopLabel)
//
// Ciphertext in eax, ebx, ecx, edx
// r9 points to first round key to use
// r10 is last key to use (unchanged)
// r11 points to InvSboxMatrixMult (unchanged)
// r12 points to InvSbox (unchanged)
// Plaintext ends up in esi, edi, ebp, r8d
//
//
// xor in first round key
//
xor eax,[r9]
xor ebx,[r9+4]
xor ecx,[r9+8]
xor edx,[r9+12]
add r9,32
// Do not unroll the loop at all because very few CPUs use this codepath so it's worth
// minimizing the binary size
AesDecryptMacroLoopLabel:
// Block is eax, ebx, ecx, edx
// r9-16 points to next round key
DEC_MIX r9-16
cmp r9,r10
lea r9,[r9+16]
jc AesDecryptMacroLoopLabel
//
// Now for the final round
// Result is in esi, edi, ebp, r8d
//
movzx esi,al
movzx esi,byte ptr[r12 + rsi]
movzx edi,ah
shr eax,16
movzx edi,byte ptr[r12 + rdi]
movzx ebp,al
shl edi,8
movzx ebp,byte ptr[r12 + rbp]
shl ebp,16
movzx eax,ah
movzx r8d,byte ptr[r12 + rax]
shl r8d,24
movzx eax,bl
movzx eax,byte ptr[r12 + rax]
or edi,eax
movzx eax,bh
shr ebx,16
movzx eax,byte ptr[r12 + rax]
shl eax,8
or ebp,eax
movzx eax,bl
movzx eax,byte ptr[r12 + rax]
movzx ebx,bh
shl eax,16
movzx ebx,byte ptr[r12 + rbx]
or r8d,eax
shl ebx,24
or esi,ebx
movzx eax,cl
movzx ebx,ch
movzx eax,byte ptr[r12 + rax]
shr ecx,16
movzx ebx,byte ptr[r12 + rbx]
shl ebx,8
or ebp,eax
or r8d,ebx
movzx eax,cl
movzx eax,byte ptr[r12 + rax]
movzx ebx,ch
movzx ebx,byte ptr[r12 + rbx]
shl eax,16
shl ebx,24
or esi,eax
or edi,ebx
movzx eax,dl
movzx ebx,dh
movzx eax,byte ptr[r12 + rax]
shr edx,16
movzx ebx,byte ptr[r12 + rbx]
shl ebx,8
or r8d,eax
or esi,ebx
movzx eax,dl
movzx eax,byte ptr[r12 + rax]
movzx ebx,dh
movzx ebx,byte ptr[r12 + rbx]
shl eax,16
shl ebx,24
or edi,eax
or ebp,ebx
//
// xor in final round key
//
xor esi,[r10]
xor edi,[r10+4]
xor ebp,[r10+8]
xor r8d,[r10+12]
MACRO_END()
#if USE_BLOCK_FUNCTION
//
// We use a block function; the AES_ENCRYPT macro merely calls the function
//
MACRO_START(AES_ENCRYPT, loopLabel)
call SymCryptAesEncryptAsmInternal
MACRO_END()
MACRO_START(AES_DECRYPT, loopLabel)
call SymCryptAesDecryptAsmInternal
MACRO_END()
//========================================
// SymCryptAesEncryptAsmInternal
//
// Internal AES encryption routine with modified calling convention.
// This function has the exact same calling convention as the AES_ENCRYPT_MACRO
FUNCTION_START(SymCryptAesEncryptAsmInternal, 0, 0)
AES_ENCRYPT_MACRO SymCryptAesEncryptAsmInternalLoop
FUNCTION_END(SymCryptAesEncryptAsmInternal)
//========================================
// SymCryptAesDecryptAsmInternal
//
// Internal AES decryption routine with modified calling convention.
// This function has the exact same calling convention as the AES_DECRYPT_MACRO
//
FUNCTION_START(SymCryptAesDecryptAsmInternal, 0, 0)
AES_DECRYPT_MACRO SymCryptAesDecryptAsmInternalLoop
FUNCTION_END(SymCryptAesDecryptAsmInternal)
#else
//
// No block function, use the macro directly
//
MACRO_START(AES_ENCRYPT, loopLabel)
AES_ENCRYPT_MACRO loopLabel
MACRO_END()
MACRO_START(AES_DECRYPT, loopLabel)
AES_DECRYPT_MACRO loopLabel
MACRO_END()
#endif
//
//VOID
//SYMCRYPT_CALL
//SymCryptAesEncrypt( _In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
// _In_reads_bytes_( SYMCRYPT_AES_BLOCK_LEN ) PCBYTE pbPlaintext,
// _Out_writes_bytes_( SYMCRYPT_AES_BLOCK_LEN ) PBYTE pbCiphertext )
//
NESTED_FUNCTION_START(SymCryptAesEncryptAsm, 3, 15)
SYMCRYPT_CHECK_MAGIC SymCryptAesEncryptAsmCheckMagic, Q1, magicFieldOffset, Q1
// Here we convert from whatever calling convention we are called from externally to our
// AES internal calling convention.
// We need to be careful that we don't overwrite an argument register before we copy it to
// the place it is needed internally in the AES functions.
// There is no automatic method for checking we do this correctly - modify with care!
// In SystemV and MSFT x64 ABIs, the possible 3 argument registers are:
// rcx, rdx, r8, rdi, rsi
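// (Q1, Q2, Q3 map to rcx, rdx, r8 under the msft x64 ABI, and to rdi, rsi, rdx under SystemV,
// following each ABI's standard integer-argument order)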
mov r10, [Q1 + lastEncRoundKeyOffset]
mov r9, Q1
mov [rsp + GET_MEMSLOT_OFFSET(slot0)], Q3
//
// Load the plaintext
//
mov eax,[Q2 ]
mov ebx,[Q2 + 4]
mov ecx,[Q2 + 8]
mov edx,[Q2 + 12]
lea r11,[GET_SYMBOL_ADDRESS(SymCryptAesSboxMatrixMult)]
AES_ENCRYPT SymCryptAesEncryptAsmLoop
// Plaintext in eax, ebx, ecx, edx
// r9 points to first round key to use
// r10 is last key to use (unchanged)
// r11 points to SboxMatrixMult (unchanged)
// Ciphertext ends up in esi, edi, ebp, r8d
// retrieve pbCiphertext using Q0 because it is always rax regardless of calling convention
mov Q0,[rsp + GET_MEMSLOT_OFFSET(slot0)]
mov [Q0 ], esi
mov [Q0 + 4], edi
mov [Q0 + 8], ebp
mov [Q0 + 12], r8d
NESTED_FUNCTION_END(SymCryptAesEncryptAsm)
//
//VOID
//SYMCRYPT_CALL
//SymCryptAesDecrypt( _In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
// _In_reads_bytes_( SYMCRYPT_AES_BLOCK_LEN ) PCBYTE pbCiphertext,
// _Out_writes_bytes_( SYMCRYPT_AES_BLOCK_LEN ) PBYTE pbPlaintext )
NESTED_FUNCTION_START(SymCryptAesDecryptAsm, 3, 15)
SYMCRYPT_CHECK_MAGIC SymCryptAesDecryptAsmCheckMagic, Q1, magicFieldOffset, Q1
// Here we convert from whatever calling convention we are called from externally to our
// AES internal calling convention.
// We need to be careful that we don't overwrite an argument register before we copy or use
// the value appropriately for use in the AES functions.
// There is no automatic method for checking we do this correctly - modify with care!
// In SystemV and MSFT x64 ABIs, the possible 3 argument registers are:
// rcx, rdx, r8, rdi, rsi
mov r9,[Q1 + lastEncRoundKeyOffset]
mov r10,[Q1 + lastDecRoundKeyOffset]
mov [rsp + GET_MEMSLOT_OFFSET(slot0)], Q3
mov eax,[Q2 ]
mov ebx,[Q2 + 4]
mov ecx,[Q2 + 8]
mov edx,[Q2 + 12]
lea r11,[GET_SYMBOL_ADDRESS(SymCryptAesInvSboxMatrixMult)]
lea r12,[GET_SYMBOL_ADDRESS(SymCryptAesInvSbox)]
AES_DECRYPT SymCryptAesDecryptAsmLoop
// Ciphertext in eax, ebx, ecx, edx
// r9 points to first round key to use
// r10 is last key to use (unchanged)
// r11 points to InvSboxMatrixMult (unchanged)
// r12 points to InvSbox (unchanged)
// Plaintext ends up in esi, edi, ebp, r8d
// retrieve pbPlaintext using Q0 because it is always rax regardless of calling convention
mov Q0,[rsp + GET_MEMSLOT_OFFSET(slot0)]
mov [Q0 ], esi
mov [Q0 + 4], edi
mov [Q0 + 8], ebp
mov [Q0 + 12], r8d
NESTED_FUNCTION_END(SymCryptAesDecryptAsm)
//VOID
//SYMCRYPT_CALL
//SymCryptAesCbcEncrypt(
// _In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
// _In_reads_bytes_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbChainingValue,
// _In_reads_bytes_( cbData ) PCBYTE pbSrc,
// _Out_writes_bytes_( cbData ) PBYTE pbDst,
// SIZE_T cbData )
NESTED_FUNCTION_START(SymCryptAesCbcEncryptAsm, 5, 15)
// Here we convert from whatever calling convention we are called from externally to our
// AES internal calling convention.
// We need to be careful that we don't overwrite an argument register before we copy or use
// the value appropriately for use in the AES functions.
// There is no automatic method for checking we do this correctly - modify with care!
// In SystemV and MSFT x64 ABIs, the possible 5 argument registers are:
// rcx, rdx, r8, r9, r10, rdi, rsi
SYMCRYPT_CHECK_MAGIC SymCryptAesCbcEncryptAsmCheckMagic, Q1, magicFieldOffset, Q1
and Q5, NOT 15 // only deal with whole # blocks
jz SymCryptAesCbcEncryptNoData
mov [rsp + GET_MEMSLOT_OFFSET(slot0)], Q2 // save pbChainingValue
mov rax, Q2 // rax = pbChainingValue
mov r13, Q3 // r13 = pbSrc
mov r15, Q5 // r15 = cbData
mov r14, Q4 // r14 = pbDst
add r15, Q3 // r15 = pbSrcEnd
mov r10,[Q1 + lastEncRoundKeyOffset] // r10 = last enc round key
mov r12,Q1 // r12 = first round key to use
//
// Load the chaining state from pbChainingValue
//
mov esi,[rax ]
mov edi,[rax + 4]
mov ebp,[rax + 8]
mov r8d,[rax + 12]
lea r11,[GET_SYMBOL_ADDRESS(SymCryptAesSboxMatrixMult)]
ALIGN(16)
SymCryptAesCbcEncryptAsmLoop:
// Loop register setup
// r10 = last round key to use
// r12 = first round key to use
// r13 = pbSrc
// r14 = pbDst
// r15 = pbSrcEnd
// chaining state in (esi,edi,ebp,r8d)
mov eax, [r13]
mov r9, r12
mov ebx, [r13+4]
xor eax, esi
mov ecx, [r13+8]
xor ebx, edi
xor ecx, ebp
mov edx, [r13+12]
xor edx, r8d
add r13, 16
AES_ENCRYPT SymCryptAesCbcEncryptAsmInnerLoop
//
// Plaintext in eax, ebx, ecx, edx
// r9 points to first round key to use
// r10 is last key to use (unchanged)
// r11 points to SboxMatrixMult (unchanged)
// Ciphertext ends up in esi, edi, ebp, r8d
//
mov [r14], esi
mov [r14+4], edi
mov [r14+8], ebp
mov [r14+12], r8d
add r14, 16
cmp r13, r15
jb SymCryptAesCbcEncryptAsmLoop
//
// Update the chaining value
//
mov Q0,[rsp + GET_MEMSLOT_OFFSET(slot0)]
mov [Q0], esi
mov [Q0+4], edi
mov [Q0+8], ebp
mov [Q0+12], r8d
SymCryptAesCbcEncryptNoData:
NESTED_FUNCTION_END(SymCryptAesCbcEncryptAsm)
//VOID
//SYMCRYPT_CALL
//SymCryptAesCbcDecrypt(
// _In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
// _In_reads_bytes_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbChainingValue,
// _In_reads_bytes_( cbData ) PCBYTE pbSrc,
// _Out_writes_bytes_( cbData ) PBYTE pbDst,
// SIZE_T cbData )
NESTED_FUNCTION_START(SymCryptAesCbcDecryptAsm, 5, 15)
// Here we convert from whatever calling convention we are called from externally to our
// AES internal calling convention.
// We need to be careful that we don't overwrite an argument register before we copy or use
// the value appropriately for use in the AES functions.
// There is no automatic method for checking we do this correctly - modify with care!
// In SystemV and MSFT x64 ABIs, the possible 5 argument registers are:
// rcx, rdx, r8, r9, r10, rdi, rsi
SYMCRYPT_CHECK_MAGIC SymCryptAesCbcDecryptAsmCheckMagic, Q1, magicFieldOffset, Q1
and Q5, NOT 15
jz SymCryptAesCbcDecryptNoData
mov [rsp + GET_MEMSLOT_OFFSET(slot0)], Q2 // save pbChainingValue
mov [rsp + GET_MEMSLOT_OFFSET(slot1)], Q3 // save pbSrc
lea r14, [Q5 - 16]
lea r15, [Q4 + r14] // r15 = pbDst pointed to last block
add r14, Q3 // r14 = pbSrc pointed to last block
mov r13,[Q1 + lastEncRoundKeyOffset]
mov r10,[Q1 + lastDecRoundKeyOffset]
lea r11,[GET_SYMBOL_ADDRESS(SymCryptAesInvSboxMatrixMult)]
lea r12,[GET_SYMBOL_ADDRESS(SymCryptAesInvSbox)]
//
// Load last ciphertext block & save on stack (we need to put it in the pbChaining buffer later)
//
mov eax,[r14]
mov ebx,[r14+4]
mov ecx,[r14+8]
mov edx,[r14+12]
mov [rsp + GET_MEMSLOT_OFFSET(slot2) ], eax
mov [rsp + GET_MEMSLOT_OFFSET(slot2)+4], ebx
mov [rsp + GET_MEMSLOT_OFFSET(slot3) ], ecx
mov [rsp + GET_MEMSLOT_OFFSET(slot3)+4], edx
jmp SymCryptAesCbcDecryptAsmLoopEntry
ALIGN(16)
SymCryptAesCbcDecryptAsmLoop:
// Loop register setup
// r13 = first round key to use
// r14 = pbSrc
// r15 = pbDst
// [slot1] = pbSrcStart
// current ciphertext block (esi,edi,ebp,r8d)
mov eax,[r14-16]
mov ebx,[r14-12]
xor esi,eax
mov ecx,[r14-8]
xor edi,ebx
mov [r15],esi
mov edx,[r14-4]
xor ebp,ecx
mov [r15+4],edi
xor r8d,edx
mov [r15+8],ebp
mov [r15+12],r8d
sub r14,16
sub r15,16
SymCryptAesCbcDecryptAsmLoopEntry:
mov r9, r13
AES_DECRYPT SymCryptAesCbcDecryptAsmInnerLoop
//
// Ciphertext in eax, ebx, ecx, edx
// r9 points to first round key to use
// r10 is last key to use (unchanged)
// r11 points to InvSboxMatrixMult (unchanged)
// r12 points to InvSbox (unchanged)
// Plaintext ends up in esi, edi, ebp, r8d
//
cmp r14, [rsp + GET_MEMSLOT_OFFSET(slot1)] // pbSrc
ja SymCryptAesCbcDecryptAsmLoop
mov rbx,[rsp + GET_MEMSLOT_OFFSET(slot0)] // pbChainingValue
xor esi,[rbx]
xor edi,[rbx+4]
xor ebp,[rbx+8]
xor r8d,[rbx+12]
mov [r15], esi
mov [r15+4], edi
mov [r15+8], ebp
mov [r15+12], r8d
//
// Update the chaining value to the last ciphertext block
//
mov rax,[rsp + GET_MEMSLOT_OFFSET(slot2)]
mov rcx,[rsp + GET_MEMSLOT_OFFSET(slot3)]
mov [rbx], rax
mov [rbx+8], rcx
SymCryptAesCbcDecryptNoData:
NESTED_FUNCTION_END(SymCryptAesCbcDecryptAsm)
//VOID
//SYMCRYPT_CALL
//SymCryptAesCtrMsb64(
// _In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
// _In_reads_bytes_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbChainingValue,
// _In_reads_bytes_( cbData ) PCBYTE pbSrc,
// _Out_writes_bytes_( cbData ) PBYTE pbDst,
// SIZE_T cbData )
NESTED_FUNCTION_START(SymCryptAesCtrMsb64Asm, 5, 15)
// Here we convert from whatever calling convention we are called from externally to our
// AES internal calling convention.
// We need to be careful that we don't overwrite an argument register before we copy or use
// the value appropriately for use in the AES functions.
// There is no automatic method for checking we do this correctly - modify with care!
// In SystemV and MSFT x64 ABIs, the possible 5 argument registers are:
// rcx, rdx, r8, r9, r10, rdi, rsi
SYMCRYPT_CHECK_MAGIC SymCryptAesCtrMsb64AsmCheckMagic, Q1, magicFieldOffset, Q1
and Q5, NOT 15 // only deal with whole # blocks
jz SymCryptAesCtrMsb64NoData
mov [rsp + GET_MEMSLOT_OFFSET(slot0)], Q2 // save pbChainingState
mov rax, Q2 // rax = pbChainingValue
mov r13, Q3 // r13 = pbSrc
mov r14, Q5 // r14 = cbData
mov r15, Q4 // r15 = pbDst
add r14, Q3 // r14 = cbData + pbSrc = pbSrcEnd
mov r10,[Q1 + lastEncRoundKeyOffset] // r10 = last enc round key
mov r12,Q1 // r12 = first round key to use
lea r11,[GET_SYMBOL_ADDRESS(SymCryptAesSboxMatrixMult)]
//
// Load the chaining state
//
mov rcx, [rax + 8]
mov rax, [rax ]
//
// Store it in our local copy (we have no register free to keep pbChainingState in)
//
mov [rsp + GET_MEMSLOT_OFFSET(slot1)], rax
mov [rsp + GET_MEMSLOT_OFFSET(slot2)], rcx
//
// Move to the right registers
//
mov rbx, rax
mov rdx, rcx
shr rbx, 32
shr rdx, 32
ALIGN(16)
SymCryptAesCtrMsb64AsmLoop:
// Loop invariant
// Current chaining state is in (eax, ebx, ecx, edx)
// r10 = last round key to use
// r11 = SboxMatrixMult
// r12 = first round key to use
// r13 = pbSrc
// r14 = pbSrcEnd
// r15 = pbDst
// [slot1..slot2] = 16 bytes chaining state block
mov r9, r12
AES_ENCRYPT SymCryptAesCtrMsb64AsmInnerLoop
//
// Plaintext in eax, ebx, ecx, edx
// r9 points to first round key to use
// r10 is last key to use (unchanged)
// r11 points to SboxMatrixMult (unchanged)
// Ciphertext ends up in esi, edi, ebp, r8d
//
// To improve latency, we FIRST
// load the chaining state, increment the counter, and write it back.
// leave the state in the (eax, ebx, ecx, edx) registers
mov eax,dword ptr [rsp + GET_MEMSLOT_OFFSET(slot1) + 0]
mov ebx,dword ptr [rsp + GET_MEMSLOT_OFFSET(slot1) + 4]
mov rcx,[rsp + GET_MEMSLOT_OFFSET(slot2) ]
bswap rcx
add rcx, 1
bswap rcx
mov [rsp + GET_MEMSLOT_OFFSET(slot2) ], rcx
mov rdx, rcx
shr rdx, 32
// THEN we process the XOR of the key stream with the data
// This order is faster as we need to have the chaining state done
// before we can proceed, but there are no dependencies on the data result
// So we can loop back to the beginning while the data stream read/writes are
// still in flight.
//
// xor with the source stream
xor esi,[r13 + 0 ]
xor edi,[r13 + 4 ]
xor ebp,[r13 + 8 ]
xor r8d,[r13 + 12]
// store at the destination
mov [r15 + 0], esi
mov [r15 + 4], edi
mov [r15 + 8], ebp
mov [r15 + 12], r8d
add r13, 16 // pbSrc += 16
add r15, 16 // pbDst += 16
cmp r13, r14
jb SymCryptAesCtrMsb64AsmLoop
//
// Copy back the chaining value - we only modified the last 8 bytes, so that is all we copy
//
mov rsi,[rsp + GET_MEMSLOT_OFFSET(slot0)] // pbChainingState
mov [rsi + 8], ecx
mov [rsi + 12], edx
//
// Wipe the chaining value on stack
//
xor rax, rax
mov [rsp + GET_MEMSLOT_OFFSET(slot1)], rax
mov [rsp + GET_MEMSLOT_OFFSET(slot2)], rax
SymCryptAesCtrMsb64NoData:
NESTED_FUNCTION_END(SymCryptAesCtrMsb64Asm)
FILE_END()

View File

@ -1,529 +0,0 @@
;
; fdef369_asm.asm Assembler code for large integer arithmetic in the default data format
;
; This file contains alternative routines that are used for modular computations
; where the modulus is 257-384 or 513-576 bits long.
; (Currently on ARM64 it is also used for 0-192-bit moduli but not on AMD64)
;
; The immediate advantage is that it improves EC performance on 384- and 521-bit curves.
;
; Most of this code is a direct copy of the default code.
; AMD64 digits are now 512 bits.
; We read the 'nDigits' value. If it is 1 digit, the values are 6 64-bit words, if it is 2 the values
; are 9 64-bit words. As we compute in groups of 3, our loop counters are one more than nDigits
;
; Copyright (c) Microsoft Corporation. Licensed under the MIT license.
;
include ksamd64.inc
include symcrypt_version.inc
include symcrypt_magic.inc
include C_asm_shared.inc
; A digit consists of 4 words of 64 bits each
;UINT32
;SYMCRYPT_CALL
;SymCryptFdef369RawAddAsm(
; _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 Src1,
; _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 Src2,
; _Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 Dst,
; UINT32 nDigits );
LEAF_ENTRY SymCryptFdef369RawAddAsm, _TEXT
; rcx = Src1
; rdx = Src2
; r8 = Dst
; r9 = nDigits
add r9, 1
xor rax, rax
xor r10, r10
; Cy = 0
SymCryptFdef369RawAddAsmLoop:
; carry is in the carry flag
mov rax,[rcx]
adc rax,[rdx]
mov [r8],rax
mov rax,[rcx + 8]
adc rax,[rdx + 8]
mov [r8 + 8], rax
mov rax,[rcx + 16]
adc rax,[rdx + 16]
mov [r8 + 16], rax
lea rcx, [rcx + 24]
lea rdx, [rdx + 24]
lea r8, [r8 + 24]
dec r9d
jnz SymCryptFdef369RawAddAsmLoop
mov rax, r10
adc rax, r10
ret
LEAF_END SymCryptFdef369RawAddAsm, _TEXT
;UINT32
;SYMCRYPT_CALL
;SymCryptFdefRawSubAsm(
; _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc1,
; _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc2,
; _Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 pDst,
; UINT32 nDigits );
LEAF_ENTRY SymCryptFdef369RawSubAsm, _TEXT
; rcx = Src1
; rdx = Src2
; r8 = Dst
; r9 = nDigits
add r9, 1
xor rax, rax
xor r10, r10
SymCryptFdef369RawSubAsmLoop:
; carry is in the carry flag
mov rax,[rcx]
sbb rax,[rdx]
mov [r8],rax
mov rax,[rcx + 8]
sbb rax,[rdx + 8]
mov [r8 + 8], rax
mov rax,[rcx + 16]
sbb rax,[rdx + 16]
mov [r8 + 16], rax
lea rcx, [rcx + 24]
lea rdx, [rdx + 24]
lea r8, [r8 + 24]
dec r9d
jnz SymCryptFdef369RawSubAsmLoop
mov rax, r10
adc rax, r10
ret
LEAF_END SymCryptFdef369RawSubAsm, _TEXT
;VOID
;SYMCRYPT_CALL
;SymCryptFdefMaskedCopy(
; _In_reads_bytes_( nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCBYTE pbSrc,
; _InOut_writes_bytes_( nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PBYTE pbDst,
; UINT32 nDigits,
; UINT32 mask )
LEAF_ENTRY SymCryptFdef369MaskedCopyAsm, _TEXT
add r8d, 1
movsxd r9, r9d
SymCryptFdef369MaskedCopyAsmLoop:
mov rax, [rcx]
mov r10, [rdx]
xor rax, r10
and rax, r9
xor rax, r10
mov [rdx], rax
mov rax, [rcx + 8]
mov r10, [rdx + 8]
xor rax, r10
and rax, r9
xor rax, r10
mov [rdx + 8], rax
mov rax, [rcx + 16]
mov r10, [rdx + 16]
xor rax, r10
and rax, r9
xor rax, r10
mov [rdx + 16], rax
; Move on to the next digit
add rcx, 24
add rdx, 24
sub r8d, 1
jnz SymCryptFdef369MaskedCopyAsmLoop
ret
LEAF_END SymCryptFdef369MaskedCopyAsm, _TEXT
;VOID
;SYMCRYPT_CALL
;SymCryptFdefRawMul(
; _In_reads_(nWords1) PCUINT32 pSrc1,
; UINT32 nDigits1,
; _In_reads_(nWords2) PCUINT32 pSrc2,
; UINT32 nDigits2,
; _Out_writes_(nWords1 + nWords2) PUINT32 pDst )
SymCryptFdef369RawMulAsm_Frame struct
SavedRbx dq ?
SavedRdi dq ?
SavedRsi dq ?
SavedR13 dq ?
SavedR12 dq ?
returnaddress dq ?
Arg1Home dq ?
Arg2Home dq ?
Arg3Home dq ?
Arg4Home dq ?
pDst dq ?
SymCryptFdef369RawMulAsm_Frame ends
NESTED_ENTRY SymCryptFdef369RawMulAsm, _TEXT
rex_push_reg rbx
push_reg r12
push_reg r13
push_reg rsi
push_reg rdi
END_PROLOGUE
; Basic structure:
; for each word in Src1:
; Dst += Src2 * word
; Register assignments
;
; rax = tmp for mul
; rbx = word from Src1 to multiply with
; rcx = pSrc1 (updated in outer loop)
; rdx = tmp for mul
; rsi = inner loop pointer into pSrc2
; rdi = inner loop pointer into pDst
; r8 = pSrc2
; r9 = nDigits2
; r10 = pDst (incremented in outer loop)
; r11 = # words left from Src1 to process
; r12 = carry
; r13 = inner loop counter
add edx, 1
add r9d, 1
lea r11d, [edx + 2*edx] ; nDigits1 * 3 = # words in Src1 to process
mov r10, [rsp + SymCryptFdef369RawMulAsm_Frame.pDst ]
; Outer loop invariant established: rcx, r8, r9, r10
mov rsi, r8 ; rsi = pSrc2
mov rdi, r10 ; rdi = pDst + outer loop ctr
mov rbx, [rcx] ; mulword
xor r12, r12
mov r13d, r9d
; First inner loop overwrites Dst, which avoids adding the current Dst value
SymCryptFdef369RawMulAsmLoop1:
mov rax, [rsi]
mul rbx
add rax, r12
adc rdx, 0
mov [rdi], rax
mov r12, rdx
mov rax, [rsi + 8]
mul rbx
add rax, r12
adc rdx, 0
mov [rdi + 8], rax
mov r12, rdx
mov rax, [rsi + 16]
mul rbx
add rax, r12
adc rdx, 0
mov [rdi + 16], rax
mov r12, rdx
add rsi, 24
add rdi, 24
sub r13d,1
jnz SymCryptFdef369RawMulAsmLoop1
mov [rdi], rdx ; write last word, cannot overflow because Dst is at least 2 digits long
sub r11d, 1
SymCryptFdef369RawMulAsmLoopOuter:
add rcx, 8 ; move to next word of pSrc1
add r10, 8 ; move Dst pointer one word over
mov rbx, [rcx]
mov rsi, r8
mov rdi, r10
xor r12, r12
mov r13d, r9d
SymCryptFdef369RawMulAsmLoop2:
mov rax, [rsi]
mul rbx
add rax, [rdi]
adc rdx, 0
add rax, r12
adc rdx, 0
mov [rdi], rax
mov r12, rdx
mov rax, [rsi + 8]
mul rbx
add rax, [rdi + 8]
adc rdx, 0
add rax, r12
adc rdx, 0
mov [rdi + 8], rax
mov r12, rdx
mov rax, [rsi + 16]
mul rbx
add rax, [rdi + 16]
adc rdx, 0
add rax, r12
adc rdx, 0
mov [rdi + 16], rax
mov r12, rdx
add rsi, 24
add rdi, 24
sub r13d,1
jnz SymCryptFdef369RawMulAsmLoop2
mov [rdi], rdx ; write next word. (stays within Dst buffer)
sub r11d, 1
jnz SymCryptFdef369RawMulAsmLoopOuter
BEGIN_EPILOGUE
pop rdi
pop rsi
pop r13
pop r12
pop rbx
ret
NESTED_END SymCryptFdef369RawMulAsm, _TEXT
;VOID
;SymCryptFdefMontgomeryReduceAsm(
; _In_ PCSYMCRYPT_MODULUS pmMod,
; _In_ PUINT32 pSrc,
; _Out_ PUINT32 pDst )
NESTED_ENTRY SymCryptFdef369MontgomeryReduceAsm, _TEXT
rex_push_reg rbx
push_reg r12
push_reg r13
push_reg r14
push_reg rsi
push_reg rdi
push_reg rbp
END_PROLOGUE
mov r11, rdx ; r11 = pSrc
mov ebp, [rcx + SymCryptModulusNdigitsOffsetAmd64] ; nDigits
add ebp, 1
mov r13, [rcx + SymCryptModulusMontgomeryInv64OffsetAmd64] ; inv64
lea rcx, [rcx + SymCryptModulusValueOffsetAmd64] ; modulus value
lea edi, [ebp + 2*ebp] ; outer loop counter, in words
xor r14d, r14d
; General register allocations
; rax = multiply result
; rbx = multiplier in inner loop
; rcx = pointer to modulus value
; rdx = multiply result
; rsi = loop counter
; rdi = loop counter
; rbp = nDigits
; r8 = pDst
; r9 = running pointer in Src
; r10 = running pointer in Mod
; r11 = pSrc (updated in outer loop)
; r12 = carry
; r13 = pmMod->tm.montgomery.inv64
; r14 = carry out from last word of previous loop iteration
SymCryptFdef369MontgomeryReduceAsmOuterLoop:
; start decoder with a few simple instructions, including at least one that requires
; a uop execution and is on the critical path
mov rbx, [r11] ; fetch word of Src we want to set to zero
mov r10, r11
mov r9, rcx
imul rbx, r13 ; lower word is same for signed & unsigned multiply
mov esi, ebp
xor r12d, r12d
SymCryptFdef369MontgomeryReduceAsmInnerloop:
; rax = mul scratch
; rbx = multiplier
; rcx = pointer to modulus value
; rdx = mul scratch
; edi = outer loop counter (words)
; esi = inner loop counter (digits)
; r9 = running ptr to modulus
; r10 = running ptr to input/scratch
; r12 = carry (64 bits)
mov rax, [r9]
mul rbx
add rax, [r10]
adc rdx, 0
add rax, r12
adc rdx, 0
mov [r10], rax
mov r12, rdx
mov rax, [r9 + 8]
mul rbx
add rax, [r10 + 8]
adc rdx, 0
add rax, r12
adc rdx, 0
mov [r10 + 8], rax
mov r12, rdx
mov rax, [r9 + 16]
mul rbx
add rax, [r10 + 16]
adc rdx, 0
add rax, r12
adc rdx, 0
mov [r10 + 16], rax
mov r12, rdx
add r9, 24
add r10, 24
sub esi,1
jnz SymCryptFdef369MontgomeryReduceAsmInnerloop
add r12, r14
mov r14d, 0
adc r14, 0
add r12, [r10]
adc r14, 0
mov [r10], r12
add r11, 8
sub edi, 1
jnz SymCryptFdef369MontgomeryReduceAsmOuterLoop
;
; Most of the work is done; now all that is left is subtract the modulus if it is smaller than the result
;
; First we compute the pSrc result minus the modulus into the destination
mov esi, ebp ; loop ctr
mov r10, r11 ; pSrc
mov r9, rcx ; pMod
mov r12, r8 ; pDst
; Cy = 0 because the last 'sub edi,1' resulted in 0
SymCryptFdef369MontgomeryReduceAsmSubLoop:
mov rax,[r10]
sbb rax,[r9]
mov [r12], rax
mov rax,[r10 + 8]
sbb rax,[r9 + 8]
mov [r12 + 8], rax
mov rax,[r10 + 16]
sbb rax,[r9 + 16]
mov [r12 + 16], rax
lea r10,[r10+24]
lea r9, [r9 +24]
lea r12,[r12+24]
dec esi
jnz SymCryptFdef369MontgomeryReduceAsmSubLoop
; Finally a masked copy from pSrc to pDst
; copy if: r14 == 0 && Cy = 1
sbb r14, 0 ; mask (64 bits)
SymCryptFdef369MontgomeryReduceAsmMaskedCopyLoop:
mov rax, [r11]
mov rsi, [r8]
xor rax, rsi
and rax, r14
xor rax, rsi
mov [r8], rax
mov rax, [r11 + 8]
mov rsi, [r8 + 8]
xor rax, rsi
and rax, r14
xor rax, rsi
mov [r8 + 8], rax
mov rax, [r11 + 16]
mov rsi, [r8 + 16]
xor rax, rsi
and rax, r14
xor rax, rsi
mov [r8 + 16], rax
; Move on to the next digit
add r11, 24
add r8, 24
sub ebp, 1
jnz SymCryptFdef369MontgomeryReduceAsmMaskedCopyLoop
BEGIN_EPILOGUE
pop rbp
pop rdi
pop rsi
pop r14
pop r13
pop r12
pop rbx
ret
NESTED_END SymCryptFdef369MontgomeryReduceAsm, _TEXT
end

View File

@ -0,0 +1,451 @@
//
// fdef369_asm.symcryptasm Assembler code for large integer arithmetic in the default data format
// Expresses asm in a generic enough way to enable generation of MASM and GAS using the
// symcryptasm_processor.py script and C preprocessor
//
// This file contains alternative routines that are used for modular computations
// where the modulus is 257-384 or 513-576 bits long.
// (Currently on ARM64 it is also used for 0-192-bit moduli but not on AMD64)
//
// The immediate advantage is that it improves EC performance on 384- and 521-bit curves.
//
// Most of this code is a direct copy of the default code.
// AMD64 digits are now 512 bits.
// We read the 'nDigits' value. If it is 1 digit, the values are 6 64-bit words, if it is 2 the values
// are 9 64-bit words. As we compute in groups of 3, our loop counters are one more than nDigits
//
// Copyright (c) Microsoft Corporation. Licensed under the MIT license.
#include "symcryptasm_shared.cppasm"
// A digit consists of 4 words of 64 bits each
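// A note on the register naming used below: Qn are 64-bit symcryptasm registers which
// symcryptasm_processor.py maps onto concrete registers for the target calling convention;
// Dn is the 32-bit low half of Qn; Q0 is always rax; and QH is the high half of a multiply
// result (rdx), which is why each mul below is followed by reads of QH.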
//UINT32
//SYMCRYPT_CALL
// SymCryptFdef369RawAddAsm(
// _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 Src1,
// _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 Src2,
// _Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 Dst,
// UINT32 nDigits )
FUNCTION_START(SymCryptFdef369RawAddAsm, 4, 5)
inc D4
xor Q0, Q0
SymCryptFdef369RawAddAsmLoop:
// carry is in the carry flag
mov Q0,[Q1]
adc Q0,[Q2]
mov [Q3],Q0
mov Q0,[Q1 + 8]
adc Q0,[Q2 + 8]
mov [Q3 + 8], Q0
mov Q0,[Q1 + 16]
adc Q0,[Q2 + 16]
mov [Q3 + 16], Q0
lea Q1, [Q1 + 24]
lea Q2, [Q2 + 24]
lea Q3, [Q3 + 24]
dec D4
jnz SymCryptFdef369RawAddAsmLoop
mov Q0, 0
adc Q0, Q0
FUNCTION_END(SymCryptFdef369RawAddAsm)
// UINT32
// SYMCRYPT_CALL
// SymCryptFdef369RawSubAsm(
// _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc1,
// _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc2,
// _Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 pDst,
// UINT32 nDigits )
FUNCTION_START(SymCryptFdef369RawSubAsm, 4, 5)
inc D4
xor Q0, Q0
SymCryptFdef369RawSubAsmLoop:
// carry is in the carry flag
mov Q0,[Q1]
sbb Q0,[Q2]
mov [Q3],Q0
mov Q0,[Q1 + 8]
sbb Q0,[Q2 + 8]
mov [Q3 + 8], Q0
mov Q0,[Q1 + 16]
sbb Q0,[Q2 + 16]
mov [Q3 + 16], Q0
lea Q1, [Q1 + 24]
lea Q2, [Q2 + 24]
lea Q3, [Q3 + 24]
dec D4
jnz SymCryptFdef369RawSubAsmLoop
mov Q0, 0
adc Q0, Q0
FUNCTION_END(SymCryptFdef369RawSubAsm)
// VOID
// SYMCRYPT_CALL
// SymCryptFdef369MaskedCopyAsm(
// _In_reads_bytes_( nDigits*SYMCRYPT_FDEF_DIGIT_SIZE ) PCBYTE pbSrc,
// _Inout_updates_bytes_( nDigits*SYMCRYPT_FDEF_DIGIT_SIZE ) PBYTE pbDst,
// UINT32 nDigits,
// UINT32 mask )
FUNCTION_START(SymCryptFdef369MaskedCopyAsm, 4, 6)
inc D3
movsxd Q4, D4
SymCryptFdef369MaskedCopyAsmLoop:
mov Q0, [Q1]
mov Q5, [Q2]
xor Q0, Q5
and Q0, Q4
xor Q0, Q5
mov [Q2], Q0
mov Q0, [Q1 + 8]
mov Q5, [Q2 + 8]
xor Q0, Q5
and Q0, Q4
xor Q0, Q5
mov [Q2 + 8], Q0
mov Q0, [Q1 + 16]
mov Q5, [Q2 + 16]
xor Q0, Q5
and Q0, Q4
xor Q0, Q5
mov [Q2 + 16], Q0
// Move on to the next digit
add Q1, 24
add Q2, 24
dec D3
jnz SymCryptFdef369MaskedCopyAsmLoop
FUNCTION_END(SymCryptFdef369MaskedCopyAsm)
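The xor/and/xor sequence above is a constant-time conditional copy. A hedged C sketch of the same trick (hypothetical name):

#include <stdint.h>

// Sketch: pDst takes pSrc's words when mask is all-ones and is left
// unchanged when mask is 0; there are no data-dependent branches.
void MaskedCopySketch( const uint64_t *pSrc, uint64_t *pDst,
                       uint32_t nWords, uint64_t mask )
{
    for( uint32_t i = 0; i < nWords; i++ )
    {
        // (src ^ dst) & mask is 0 or (src ^ dst); xoring it into dst
        // therefore yields dst or src respectively.
        pDst[i] ^= (pSrc[i] ^ pDst[i]) & mask;
    }
}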
// VOID
// SYMCRYPT_CALL
// SymCryptFdef369RawMulAsm(
// _In_reads_(nDigits1*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc1,
// UINT32 nDigits1,
// _In_reads_(nDigits2*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc2,
// UINT32 nDigits2,
// _Out_writes_((nDigits1+nDigits2)*SYMCRYPT_FDEF_DIGIT_NUINT32) PUINT32 pDst )
MUL_FUNCTION_START(SymCryptFdef369RawMulAsm, 5, 11)
// Basic structure:
// for each word in Src1:
// Dst += Src2 * word
// Register assignments
//
// Q0 = tmp for mul
// QH = tmp for mul
// Q1 = pSrc1 (updated in outer loop)
// D2 = # words left from Src1 to process
// Q3 = pSrc2
// Q4 = nDigits2
// Q5 = pDst (incremented in outer loop)
// Q6 = inner loop pointer into pSrc2
// Q7 = inner loop pointer into pDst
// Q8 = word from Src1 to multiply with
// Q9 = carry
// D10 = inner loop counter
inc D2
inc D4
lea D2, [D2 + 2*D2] // 3 * (nDigits1 + 1) = # words in Src1 to process
// Outer loop invariant established: Q1, Q3, D4, Q5
mov Q6, Q3 // Q6 = pSrc2
mov Q7, Q5 // Q7 = pDst + outer loop ctr
mov Q8, [Q1] // mulword
xor Q9, Q9
mov D10, D4
// First inner loop overwrites Dst, which avoids adding the current Dst value
ALIGN(16)
SymCryptFdef369RawMulAsmLoop1:
mov Q0, [Q6]
mul Q8
add Q0, Q9
adc QH, 0
mov [Q7], Q0
mov Q9, QH
mov Q0, [Q6 + 8]
mul Q8
add Q0, Q9
adc QH, 0
mov [Q7 + 8], Q0
mov Q9, QH
mov Q0, [Q6 + 16]
mul Q8
add Q0, Q9
adc QH, 0
mov [Q7 + 16], Q0
mov Q9, QH
add Q6, 24
add Q7, 24
dec D10
jnz SymCryptFdef369RawMulAsmLoop1
mov [Q7], QH // write last word, cannot overflow because Dst is at least 2 digits long
dec D2
ALIGN(16)
SymCryptFdef369RawMulAsmLoopOuter:
add Q1, 8 // move to next word of pSrc1
add Q5, 8 // move Dst pointer one word over
mov Q8, [Q1]
mov Q6, Q3
mov Q7, Q5
xor Q9, Q9
mov D10, D4
ALIGN(16)
SymCryptFdef369RawMulAsmLoop2:
mov Q0, [Q6]
mul Q8
add Q0, [Q7]
adc QH, 0
add Q0, Q9
adc QH, 0
mov [Q7], Q0
mov Q9, QH
mov Q0, [Q6 + 8]
mul Q8
add Q0, [Q7 + 8]
adc QH, 0
add Q0, Q9
adc QH, 0
mov [Q7 + 8], Q0
mov Q9, QH
mov Q0, [Q6 + 16]
mul Q8
add Q0, [Q7 + 16]
adc QH, 0
add Q0, Q9
adc QH, 0
mov [Q7 + 16], Q0
mov Q9, QH
add Q6, 24
add Q7, 24
dec D10
jnz SymCryptFdef369RawMulAsmLoop2
mov [Q7], QH // write next word. (stays within Dst buffer)
dec D2
jnz SymCryptFdef369RawMulAsmLoopOuter
MUL_FUNCTION_END(SymCryptFdef369RawMulAsm)
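The "Basic structure" comment above is the classic schoolbook multiply. A minimal C sketch (hypothetical name; unsigned __int128 stands in for the mul/adc chains) of the same two-loop shape, including the first pass that writes rather than adds:

#include <stdint.h>

// Sketch: pDst = pSrc1 * pSrc2, word counts already expanded from digits.
void RawMulSketch( const uint64_t *pSrc1, uint32_t nWords1,
                   const uint64_t *pSrc2, uint32_t nWords2, uint64_t *pDst )
{
    for( uint32_t i = 0; i < nWords1; i++ )     // outer loop over Src1 words
    {
        uint64_t carry = 0;
        for( uint32_t j = 0; j < nWords2; j++ ) // inner loop (Loop1/Loop2)
        {
            unsigned __int128 t = (unsigned __int128) pSrc1[i] * pSrc2[j]
                                + carry + (i == 0 ? 0 : pDst[i + j]);
            pDst[i + j] = (uint64_t) t;
            carry = (uint64_t)(t >> 64);
        }
        pDst[i + nWords2] = carry;  // top word; stays within the Dst buffer
    }
}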
// VOID
// SYMCRYPT_CALL
// SymCryptFdef369MontgomeryReduceAsm(
// _In_ PCSYMCRYPT_MODULUS pmMod,
// _Inout_ PUINT32 pSrc,
// _Out_ PUINT32 pDst )
MUL_FUNCTION_START(SymCryptFdef369MontgomeryReduceAsm, 3, 13)
mov D4, [Q1 + SymCryptModulusNdigitsOffsetAmd64] // nDigits
inc D4
mov Q5, [Q1 + SymCryptModulusMontgomeryInv64OffsetAmd64] // inv64
lea Q1, [Q1 + SymCryptModulusValueOffsetAmd64] // modulus value
lea D12, [D4 + 2*D4] // outer loop counter, in words
xor D8, D8
// General register allocations
// Q0 = multiply result
// QH = multiply result
// Q1 = pointer to modulus value
// Q2 = pSrc (updated in outer loop)
// Q3 = pDst
// D4 = nDigits
// Q5 = pmMod->tm.montgomery.inv64
// Q6 = multiplier in inner loop
// Q7 = carry
// Q8 = carry out from last word of previous loop iteration
// Q9 = running pointer in Src
// Q10 = running pointer in Mod
// D11 = loop counter
// D12 = outer loop counter (words)
ALIGN(16)
SymCryptFdef369MontgomeryReduceAsmOuterLoop:
// start decoder with a few simple instructions, including at least one that requires
// a uop execution and is on the critical path
mov Q6, [Q2] // fetch word of Src we want to set to zero
mov Q10, Q2
mov Q9, Q1
imul Q6, Q5 // lower word is same for signed & unsigned multiply
mov D11, D4
xor D7, D7
ALIGN(16)
SymCryptFdef369MontgomeryReduceAsmInnerloop:
// Q0 = mul scratch
// QH = mul scratch
// Q1 = pointer to modulus value
// Q6 = multiplier
// Q7 = carry (64 bits)
// Q9 = running ptr to modulus
// Q10 = running ptr to input/scratch
// D11 = inner loop counter (digits)
// D12 = outer loop counter (words)
mov Q0, [Q9]
mul Q6
add Q0, [Q10]
adc QH, 0
add Q0, Q7
adc QH, 0
mov [Q10], Q0
mov Q7, QH
mov Q0, [Q9 + 8]
mul Q6
add Q0, [Q10 + 8]
adc QH, 0
add Q0, Q7
adc QH, 0
mov [Q10 + 8], Q0
mov Q7, QH
mov Q0, [Q9 + 16]
mul Q6
add Q0, [Q10 + 16]
adc QH, 0
add Q0, Q7
adc QH, 0
mov [Q10 + 16], Q0
mov Q7, QH
add Q9, 24
add Q10, 24
dec D11
jnz SymCryptFdef369MontgomeryReduceAsmInnerloop
add Q7, Q8
mov D8, 0
adc Q8, 0
add Q7, [Q10]
adc Q8, 0
mov [Q10], Q7
add Q2, 8
dec D12
jnz SymCryptFdef369MontgomeryReduceAsmOuterLoop
//
// Most of the work is done - now all that is left is to subtract the modulus if it is smaller than the result
//
// First we compute the pSrc result minus the modulus into the destination
mov D11, D4 // loop ctr
mov Q10, Q2 // pSrc
mov Q9, Q1 // pMod
mov Q7, Q3 // pDst
// Cy = 0 because the last 'adc Q8,0' resulted in 0, 1, or 2
ALIGN(16)
SymCryptFdef369MontgomeryReduceAsmSubLoop:
mov Q0,[Q10]
sbb Q0,[Q9]
mov [Q7], Q0
mov Q0,[Q10 + 8]
sbb Q0,[Q9 + 8]
mov [Q7 + 8], Q0
mov Q0,[Q10 + 16]
sbb Q0,[Q9 + 16]
mov [Q7 + 16], Q0
lea Q10,[Q10 + 24]
lea Q9,[Q9 + 24]
lea Q7,[Q7 + 24]
dec D11
jnz SymCryptFdef369MontgomeryReduceAsmSubLoop
// Finally a masked copy from pSrc to pDst
// copy if: Q8 == 0 && Cy == 1
sbb Q8, 0 // mask (64 bits)
ALIGN(16)
SymCryptFdef369MontgomeryReduceAsmMaskedCopyLoop:
mov Q0, [Q2]
mov Q1, [Q3]
xor Q0, Q1
and Q0, Q8
xor Q0, Q1
mov [Q3], Q0
mov Q0, [Q2 + 8]
mov Q1, [Q3 + 8]
xor Q0, Q1
and Q0, Q8
xor Q0, Q1
mov [Q3 + 8], Q0
mov Q0, [Q2 + 16]
mov Q1, [Q3 + 16]
xor Q0, Q1
and Q0, Q8
xor Q0, Q1
mov [Q3 + 16], Q0
// Move on to the next digit
add Q2, 24
add Q3, 24
dec D4
jnz SymCryptFdef369MontgomeryReduceAsmMaskedCopyLoop
MUL_FUNCTION_END(SymCryptFdef369MontgomeryReduceAsm)
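A hedged C sketch of the whole reduction (hypothetical names; unsigned __int128 in place of mul/adc; inv64 is the negated modular inverse stored in pmMod->tm.montgomery.inv64, so that adding m*Mod zeroes the bottom word each pass):

#include <stdint.h>

void MontgomeryReduceSketch( const uint64_t *pMod, uint64_t inv64,
                             uint64_t *pSrc, uint64_t *pDst, uint32_t nWords )
{
    uint64_t hc = 0;                                // high carry (Q8)

    for( uint32_t w = 0; w < nWords; w++ )          // outer loop (D12)
    {
        uint64_t m = pSrc[w] * inv64;               // multiplier (Q6)
        uint64_t c = 0;                             // running carry (Q7)
        for( uint32_t i = 0; i < nWords; i++ )      // inner loop; 3 words/iter in the asm
        {
            unsigned __int128 t = (unsigned __int128) m * pMod[i] + pSrc[w + i] + c;
            pSrc[w + i] = (uint64_t) t;             // word w becomes zero when i == 0
            c = (uint64_t)(t >> 64);
        }
        unsigned __int128 t = (unsigned __int128) pSrc[w + nWords] + c + hc;
        pSrc[w + nWords] = (uint64_t) t;
        hc = (uint64_t)(t >> 64);
    }

    // Subtract the modulus from the shifted-down result ...
    uint64_t borrow = 0;
    for( uint32_t i = 0; i < nWords; i++ )
    {
        unsigned __int128 t = (unsigned __int128) pSrc[nWords + i] - pMod[i] - borrow;
        pDst[i] = (uint64_t) t;
        borrow = (uint64_t)(t >> 64) & 1;
    }

    // ... and keep the unsubtracted value only if the subtraction borrowed
    // while hc == 0, via the same constant-time masked copy as above.
    uint64_t mask = 0 - (uint64_t)(borrow & (hc ^ 1));
    for( uint32_t i = 0; i < nWords; i++ )
    {
        pDst[i] ^= (pDst[i] ^ pSrc[nWords + i]) & mask;
    }
}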
FILE_END()

Diff not shown because of its large size.

Diff not shown because of its large size.


@ -1,224 +0,0 @@
;
; Macros for the multiplication routines in amd64
; Copyright (c) Microsoft Corporation. Licensed under the MIT license.
;
; General multiplication
MULT_SINGLEADD_128 MACRO index, src_reg, dst_reg
; rax = mul scratch
; rbx = multiplier
; rdx = mul scratch
; src_reg = running ptr to input
; dst_reg = running ptr to output/scratch
; r12 = carry for even words (64 bits)
; r15 = carry for odd words (64 bits)
mov rax, [src_reg + 8*index]
mul rbx
mov r15, rdx
add rax, r12
mov [dst_reg + 8*index], rax
adc r15, 0
mov rax, [src_reg + 8*(index+1)]
mul rbx
mov r12, rdx
add rax, r15
mov [dst_reg + 8*(index+1)], rax
adc r12, 0
ENDM
MULT_DOUBLEADD_128 MACRO index, src_reg, dst_reg
; rax = mul scratch
; rbx = multiplier
; rdx = mul scratch
; src_reg = running ptr to input
; dst_reg = running ptr to output/scratch
; r12 = carry for even words (64 bits)
; r15 = carry for odd words (64 bits)
mov rax, [src_reg + 8*index]
mul rbx
mov r15, rdx
add rax, [dst_reg + 8*index]
adc r15, 0
add rax, r12
mov [dst_reg + 8*index], rax
adc r15, 0
mov rax, [src_reg + 8*(index+1)]
mul rbx
mov r12, rdx
add rax, [dst_reg + 8*(index+1)]
adc r12, 0
add rax, r15
mov [dst_reg + 8*(index+1)], rax
adc r12, 0
ENDM
; Squaring
SQR_SINGLEADD_64 MACRO index, src_reg, dst_reg, src_carry, dst_carry
; rax = mul scratch
; rbx = multiplier
; rdx = mul scratch
; src_reg = running ptr to input
; dst_reg = running ptr to output/scratch
; src_carry = input carry
; dst_carry = output carry
mov rax, [src_reg + 8*index]
mul rbx
mov dst_carry, rdx
add rax, src_carry
mov [dst_reg + 8*index], rax
adc dst_carry, 0
ENDM
SQR_DOUBLEADD_64 MACRO index, src_reg, dst_reg, src_carry, dst_carry
; rax = mul scratch
; rbx = multiplier
; rdx = mul scratch
; src_reg = running ptr to input
; dst_reg = running ptr to output/scratch
; src_carry = input carry
; dst_carry = output carry
mov rax, [src_reg + 8*index]
mul rbx
mov dst_carry, rdx
add rax, [dst_reg + 8*index]
adc dst_carry, 0
add rax, src_carry
mov [dst_reg + 8*index], rax
adc dst_carry, 0
ENDM
SQR_SHIFT_LEFT MACRO index
mov rax, [rdi + 8*index]
adc rax, rax ; Shift left and add the carry
mov [rdi + 8*index], rax
ENDM
SQR_DIAGONAL_PROP MACRO index
;;;;;;;;;;;;;;;;;;;;;;;;
; Calculating the square
mov rax, [rsi + 8*index] ; mulword
mul rax ; m^2
; Adding the square to the even column
add rax, [rdi + 16*index]
adc rdx, 0
add rax, r12
adc rdx, 0
mov [rdi + 16*index], rax
; Propagating the sum to the next column
mov rax, rdx
xor rdx, rdx
add rax, [rdi + 16*index + 8]
adc rdx, 0
mov [rdi + 16*index + 8], rax
mov r12, rdx
ENDM
; Size-specific macros
; A common prologue & epilogue between several functions allows jumping between them...
MULT_COMMON_PROLOGUE MACRO
; We need all the registers
push_reg r12
push_reg r13
push_reg r14
push_reg r15
push_reg rdi
push_reg rsi
push_reg rbx
push_reg rbp
END_PROLOGUE
ENDM
MULT_COMMON_EPILOGUE MACRO
BEGIN_EPILOGUE
pop rbp
pop rbx
pop rsi
pop rdi
pop r15
pop r14
pop r13
pop r12
ret
ENDM
MUL14 MACRO Mult, pA, R0, R1, R2, R3, Cy
; (R0, R1, R2, R3, rdx) = Mult * (A0..3) + (R0, R1, R2, R3)
; Cy, rax = scratch
mov rax, [pA]
mul Mult
add R0, rax
adc rdx, 0
mov Cy, rdx
mov rax, [pA + 8]
mul Mult
add R1, rax
adc rdx, 0
add R1, Cy
adc rdx, 0
mov Cy, rdx
mov rax, [pA + 16]
mul Mult
add R2, rax
adc rdx, 0
add R2, Cy
adc rdx, 0
mov Cy, rdx
mov rax, [pA + 24]
mul Mult
add R3, rax
adc rdx, 0
add R3, Cy
adc rdx, 0
ENDM
; Macros for size-specific squaring
SQR_DOUBLEADD_64_2 MACRO index
SQR_DOUBLEADD_64 (index), rsi, rdi, r12, r15
SQR_DOUBLEADD_64 (index + 1), rsi, rdi, r15, r12
ENDM
SQR_DOUBLEADD_64_4 MACRO index
SQR_DOUBLEADD_64_2 (index)
SQR_DOUBLEADD_64_2 (index + 2)
ENDM
SQR_DOUBLEADD_64_8 MACRO index
SQR_DOUBLEADD_64_4 (index)
SQR_DOUBLEADD_64_4 (index + 4)
ENDM
SQR_SIZE_SPECIFIC_INIT MACRO
lea rcx, [rcx + 8] ; move Src pointer 1 word over
lea r10, [r10 + 16] ; move Dst pointer 2 words over
mov rsi, rcx ; rsi = inner pSrc
mov rdi, r10 ; rdi = inner pDst
mov rbx, [rcx] ; Get the next mulword
lea rsi, [rsi + 8] ; move Src pointer 1 word over
ENDM
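SQR_DIAGONAL_PROP is the least obvious of these macros. A hedged C sketch of one step (hypothetical names; unsigned __int128 in place of mul/adc): after the doubled cross products are in place, it adds pSrc[index]^2 into column 2*index and folds the sum into column 2*index+1.

#include <stdint.h>

// Sketch of one diagonal step; *pCarry plays the role of r12.
void SqrDiagonalStepSketch( const uint64_t *pSrc, uint64_t *pDst,
                            uint32_t index, uint64_t *pCarry )
{
    unsigned __int128 sq = (unsigned __int128) pSrc[index] * pSrc[index];

    // Add the low square word and the incoming carry into the even column.
    unsigned __int128 lo = (unsigned __int128) pDst[2*index] + (uint64_t) sq + *pCarry;
    pDst[2*index] = (uint64_t) lo;

    // Propagate the high square word plus the add carries into the odd column.
    unsigned __int128 hi = (unsigned __int128) pDst[2*index + 1]
                         + (uint64_t)(sq >> 64) + (uint64_t)(lo >> 64);
    pDst[2*index + 1] = (uint64_t) hi;
    *pCarry = (uint64_t)(hi >> 64);
}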

Diff not shown because of its large size.

Diff not shown because of its large size.


@ -1,423 +0,0 @@
;
; Sha1Asm.Asm
;
; Copyright (c) Microsoft Corporation. Licensed under the MIT license.
;
;
;
; This module implements the bulk processing of the FIPS 180-1 SHA message digest algorithm
; for the x64 processor architecture.
;
; This implementation is derived from the 32-bit one, which in turn is derived
; from an older one by Scott Field and Dan Shumow.
;
include ksamd64.inc
TITLE sha1asm.asm
;
; The four round constants used by SHA-1
;
K0_19 EQU 05a827999H
K20_39 EQU 06ed9eba1H
K40_59 EQU 08f1bbcdcH
K60_79 EQU 0ca62c1d6H
;VOID
;SYMCRYPT_CALL
;SymCryptSha1AppendBlocks( _Inout_updates_( 5 ) PUINT32 H,
; _In_reads_bytes_( cbData ) PCBYTE pbData,
; SIZE_T cbData )
;
;
; This function allocates stack space, so it is not a LEAF function
; but a nested one.
;
NESTED_ENTRY SymCryptSha1AppendBlocksAsm, _TEXT
;
; To keep stack manipulations simple we define a structure and use that for all accesses.
;
SymCryptSha1AppendBlocksFrame struct 16, NONUNIQUE
;
; To keep the RSP aligned we need (8 mod 16) bytes of local stack space.
; this is the case, so there is no need for a dummy location
;
Wbuf dd 16 dup (?)
EndAddress dq ?
SaveR12 dq ?
SaveR13 dq ?
SaveR14 dq ?
SaveR15 dq ?
SaveRdi dq ?
SaveRsi dq ?
SaveRbp dq ?
SaveRbx dq ?
ReturnAddress dq ?
CallerP1Home dq ?
CallerP2Home dq ?
CallerP3Home dq ?
CallerP4Home dq ?
SymCryptSha1AppendBlocksFrame ends
;
; We use the W buffer extensively; this is a shorthand for the base address
;
W equ rsp+SymCryptSha1AppendBlocksFrame.Wbuf
;
; Set up our stack frame and save non-volatile registers
;
rex_push_reg rbx
push_reg rbp
push_reg rsi
push_reg rdi
push_reg r15
push_reg r14
push_reg r13
push_reg r12
alloc_stack SymCryptSha1AppendBlocksFrame.SaveR12
END_PROLOGUE
;
;Register allocation:
;
;5 registers for state
;2 scratch
;6 registers for W[t-1], W[t-2], W[t-3], W[t-14], W[t-15], W[t-16]
;1 for data pointer
;1 for H pointer
;
;
; To allow macro re-ordering of our registers we use symbolic names
; for the registers.
; s0-s4 are the 5 state registers. x1 and x2 are extra scratch registers.
; w0-w5 contain the W state cache
;
; Note: some other code puts the right value in the right register and
; has to be updated if this mapping is changed.
;
; a is in register (round % 5)
; b is in register (round+4 % 5)
; c is in register (round+3 % 5)
; d is in register (round+2 % 5)
; e is in register (round+1 % 5)
; This way, if round is incremented we move a->b, b->c, c->d, d->e, and e->a
; For optimization the actual value of a is in scratch register x1 at the start of each round
;
; W[t- 1] is in register (round % 6)
; W[t- 2] is in register (round+5 % 6)
; W[t- 3] is in register (round+4 % 6) (is loaded with W[t-13] in each round)
; W[t-14] is in register (round+3 % 6)
; W[t-15] is in register (round+2 % 6)
; W[t-16] is in register (round+1 % 6)
; If round is incremented the values all appear in their right place.
s0 EQU eax
s1 EQU ebx
s2 EQU ecx
s3 EQU edx
s4 EQU esi
w0 EQU r9d
w1 EQU r10d
w2 EQU r11d
w3 EQU r12d
w4 EQU r13d
w5 EQU r14d
x1 EQU ebp ; scratch 1
x2 EQU edi ; scratch 2
dataPtr EQU r8 ; Points to data buffer
HPtr EQU r15 ; Points to H
; At this point:
; rcx = H
; rdx = pbData
; r8 = cbData
;
; compute the end address, address of byte after last block we will process
; This code ensures that we never exceed the data buffer we were given,
; although we silently round the cbData parameter down to the next
; multiple of 64.
; Do nothing if no blocks need to be processed.
;
and r8,NOT 3fh ; round down to multiple of 64
jz SymCryptSha1AppendBlocksDone
add r8,rdx ; pbData + (cbData & ~0x3f)
mov [rsp+SymCryptSha1AppendBlocksFrame.EndAddress], r8
mov dataPtr,rdx
mov Hptr,rcx
;
; Load the H state, note that the a value lives in x1 at the round code boundary
;
mov x1,[Hptr ]
mov s4,[Hptr+ 4]
mov s3,[Hptr+ 8]
mov s2,[Hptr+12]
mov s1,[Hptr+16]
SymCryptSha1AppendBlocksLoop:
;
; This is the main loop. We process 64 bytes in each iteration.
;
; Most of the code in the loop is generated through macros using parameters to
; rename the registers.
;
ROUND_CH_0_15 MACRO round,sa,sb,sc,sd,se,wt,x1,x2
;
; Code for round 0-15.
; This code loads data from the data buffer & BSWAPs the data to get it into the
; right form.
;
; Parameters:
; round round number
; sa register that will contain the a value
; sb register that contains the b value
; sc register that contains the c value
; sd register that contains the d value
; se register that contains the e value
; x1 scratch, contains the a value on entry
; x2 scratch register.
; wt register loaded with Wt
;
; We use the formula CH(b,c,d) = ((d ^ c) & b) ^ d which uses only one temp register.
; We start with the d value as that is the oldest value and available the first
;
; See FIPS 180-2 for our symbolic notation.
;
mov x2,sd ; x2 = d
mov wt,[dataPtr+4*round] ; Fetch word from message
mov sa, x1 ; put a in the correct register
bswap wt ; wt = Wt
xor x2,sc ; x2 = (d ^ c)
rol x1,5 ; x1 = ROL(a,5)
add se,wt ; se = e + Wt
and x2,sb ; x2 = ((d ^ c) & b)
mov [W + 4*round],wt ; Store in W buffer for future use
ror sb,2 ; sb = ROL( b, 30 )
add se,x1 ; se = e + Wt + ROL(a,5)
xor x2,sd ; x2 = ((d ^ c) & b) ^ d = CH(b,c,d)
lea x1,[se+x2+K0_19] ; x1 = e + Wt + ROL(a,5) + Ch(b,c,d) + Kt
ENDM
MSG_EXP MACRO round, se, wa, wb, wc
; round round number
; se register of state to add expanded message word to
; wa register of W[round-16], will be updated to contain W[round]
; wb register of W[round-14]
; wc register of W[round- 3], will be loaded with W[round-13]
xor wc, wb ; wc = W[t-3] ^ W[t-14]
xor wa,[W+4*((round-8) MOD 16)] ; wa = W[t-16] ^ W[t-8]
xor wa, wc ; wa = W[t-16] ^ W[t-14] ^ W[t-8] ^ W[t-3]
rol wa,1 ; wa = Wt
IF round LT (80 - 1)
; do not load wc with W[t-13] in the last round; it will not be needed
mov wc,[W+4*((round-13) MOD 16)] ; wc = W[t-13]
ENDIF
add se,wa ; re = e + Wt
IF round LT (80 - 8)
; don't store Wt in the last 8 rounds. The value would never be used
mov [W+4*(round MOD 16)], wa; Store Wt
ENDIF
ENDM
ROUND_CH MACRO round, sa, sb, sc, sd, se, wa, wb, wc, x1, x2
;
; See ROUND_CH_0_15 for most parameters.
; x1 and x2 are both scratch registers
; wa register of W[round-16], will be updated to contain W[round]
; wb register of W[round-14]
; wc register of W[round- 3], will be loaded with W[round-13]
;
xor wc, wb ; wc = W[t-3] ^ W[t-14]
xor wa,[W+4*((round-8) MOD 16)] ; wa = W[t-16] ^ W[t-8]
xor wa, wc ; wa = W[t-16] ^ W[t-14] ^ W[t-8] ^ W[t-3]
rol wa,1 ; wa = Wt
mov wc,[W+4*((round-13) MOD 16)] ; wc = W[t-13]
add se,wa ; re = e + Wt
mov [W+4*(round MOD 16)], wa ; Store Wt
mov sa, x1 ; put a in the correct register
mov x2,sd ; x2 = d
rol x1,5 ; x1 = ROL(a,5)
xor x2,sc ; x2 = (d ^ c)
add se,x1 ; re = e + Wt + ROL(a,5)
and x2,sb ; x2 = ((d ^ c) & b)
ror sb,2 ; rb = ROL( b, 30 )
xor x2,sd ; x2 = ((d ^ c) & b) ^ d = CH(b,c,d)
lea x1,[se+x2+K0_19] ; re = e + Wt + ROL(a,5) + Ch(b,c,d) + Kt
ENDM
ROUND_PARITY MACRO round, sa, sb, sc, sd, se, wa, wb, wc, x1, x2, K
;
; See ROUND_CH for most parameters
; K is the round constant to use.
;
; The order of xorring the registers b, c, and d is driven by the data dependency graph.
; We start with d (the oldest) and then do b to unblock the subsequent rotate
;
MSG_EXP round, se, wa, wb, wc ; re = e + Wt
mov sa,x1 ; store a value in right register
rol x1,5 ; x1 = ROL(a,5)
add se,x1 ; re = e + Wt + ROL(a,5)
mov x2,sd ; x1 = d
xor x2,sb ; x1 = (d ^ b)
xor x2,sc ; x1 = (d ^ b ^ c) = Parity(b,c,d)
ror sb,2 ; rb = ROL( b, 30 )
lea x1,[se+x2+K] ; re = e + ROL(a,5) + Parity(b,c,d) + Wt + Kt
ENDM
ROUND_MAJ MACRO round, sa, sb, sc, sd, se, wa, wb, wc, x1, x2
;
; See above for parameter explanation
;
MSG_EXP round, se, wa, wb, wc ; re = e + Wt
mov sa,x1 ; store a value in right register
rol x1,5 ; x1 = ROL(a,5)
add se,x1 ; re = e + ROL(a,5)
mov x1,sd ; x1 = d
or x1,sc ; x1 = (d | c)
and x1,sb ; x1 = ((d | c) & b)
mov x2,sc ; x2 = c
and x2,sd ; x2 = (c & d)
or x1,x2 ; x1 = ((d | c) & b) | (d & c) = MAJ(b,c,d)
ror sb,2 ; rb = ROL( b, 30 )
lea x1,[se+x1+K40_59] ; re = e + ROL(a,5) + Wt + Maj(b,c,d) + Kt
ENDM
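For reference, the Boolean round functions these macros compute, as a small C sketch; the CH and MAJ rewrites are exactly the one-temp-register forms used above:

#include <stdint.h>

static uint32_t Ch( uint32_t b, uint32_t c, uint32_t d )
{
    return ((d ^ c) & b) ^ d;       // == (b & c) | (~b & d)
}

static uint32_t Parity( uint32_t b, uint32_t c, uint32_t d )
{
    return b ^ c ^ d;
}

static uint32_t Maj( uint32_t b, uint32_t c, uint32_t d )
{
    return ((d | c) & b) | (d & c); // == (b & c) | (b & d) | (c & d)
}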
;
; With these macros we can now produce the actual code.
; Note the use of the % operator which evaluates the expression and yields the result as text.
; Together with the macros and the s<i> and w<i> EQUs this provides us with automatic register renaming
; for each round.
;
; The first 16 rounds are more complicated as we need to use the right registers to load the msg in
; so we do those by hand
;
; W[t- 1] is in register (round % 6)
; W[t- 2] is in register (round+5 % 6)
; W[t- 3] is in register (round+4 % 6) (is loaded with W[t-13] in each round)
; W[t-14] is in register (round+3 % 6)
; W[t-15] is in register (round+2 % 6)
; W[t-16] is in register (round+1 % 6)
;
ROUND_CH_0_15 0, s0, s4, s3, s2, s1, w5, x1, x2 ;W[t-16] for t=16 is in w5
ROUND_CH_0_15 1, s1, s0, s4, s3, s2, w0, x1, x2 ;W[t-15] for t=16 is in w0
ROUND_CH_0_15 2, s2, s1, s0, s4, s3, w1, x1, x2 ;W[t-14] for t=16 is in w1
ROUND_CH_0_15 3, s3, s2, s1, s0, s4, w3, x1, x2 ;
ROUND_CH_0_15 4, s4, s3, s2, s1, s0, w4, x1, x2 ;
ROUND_CH_0_15 5, s0, s4, s3, s2, s1, w3, x1, x2 ;
ROUND_CH_0_15 6, s1, s0, s4, s3, s2, w4, x1, x2 ;
ROUND_CH_0_15 7, s2, s1, s0, s4, s3, w3, x1, x2 ;
ROUND_CH_0_15 8, s3, s2, s1, s0, s4, w4, x1, x2 ;
ROUND_CH_0_15 9, s4, s3, s2, s1, s0, w3, x1, x2 ;
ROUND_CH_0_15 10, s0, s4, s3, s2, s1, w4, x1, x2 ;
ROUND_CH_0_15 11, s1, s0, s4, s3, s2, w3, x1, x2 ;
ROUND_CH_0_15 12, s2, s1, s0, s4, s3, w4, x1, x2 ;
ROUND_CH_0_15 13, s3, s2, s1, s0, s4, w2, x1, x2 ;W[t-3] for t=16 is in w2
ROUND_CH_0_15 14, s4, s3, s2, s1, s0, w3, x1, x2 ;W[t-2] for t=16 is in w3
ROUND_CH_0_15 15, s0, s4, s3, s2, s1, w4, x1, x2 ;W[t-1] for t=16 is in w4
FOR t, <16, 17, 18, 19>
ROUND_CH t, s%(t MOD 5), s%((t+4) MOD 5), s%((t+3) MOD 5), s%((t+2) MOD 5), s%((t+1) MOD 5), w%((t+1) MOD 6), w%((t+3) MOD 6), w%((t+4) MOD 6), x1, x2
ENDM
FOR t, <20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39>
ROUND_PARITY t, s%(t MOD 5), s%((t+4) MOD 5), s%((t+3) MOD 5), s%((t+2) MOD 5), s%((t+1) MOD 5), w%((t+1) MOD 6), w%((t+3) MOD 6), w%((t+4) MOD 6), x1, x2, K20_39
ENDM
FOR t, <40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59>
ROUND_MAJ t, s%(t MOD 5), s%((t+4) MOD 5), s%((t+3) MOD 5), s%((t+2) MOD 5), s%((t+1) MOD 5), w%((t+1) MOD 6), w%((t+3) MOD 6), w%((t+4) MOD 6), x1, x2
ENDM
FOR t, <60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79>
ROUND_PARITY t, s%(t MOD 5), s%((t+4) MOD 5), s%((t+3) MOD 5), s%((t+2) MOD 5), s%((t+1) MOD 5), w%((t+1) MOD 6), w%((t+3) MOD 6), w%((t+4) MOD 6), x1, x2, K60_79
ENDM
;
; Now we update the state, & the dataPtr
;
add x1,[Hptr ]
add s4,[Hptr+ 4]
add dataPtr,64
add s3,[Hptr+ 8]
add s2,[Hptr+12]
add s1,[Hptr+16]
mov [Hptr ], x1
mov [Hptr+ 4], s4
cmp dataPtr,[rsp+SymCryptSha1AppendBlocksFrame.EndAddress] ; Loop terminating condition
mov [Hptr+ 8], s3
mov [Hptr+12], s2
mov [Hptr+16], s1
jc SymCryptSha1AppendBlocksLoop ; Main loop
;
; We're done processing the blocks. The result is already in the state, so all we have to do
; is clean up.
;
; Wipe the W buffer
; The @@: label is an anonymous label. You can refer to the previous one using @B, which is easy to read.
;
mov rcx,64
xor rax,rax
@@: sub ecx,16
mov [rsp+rcx ],rax
mov [rsp+rcx+8],rax
jnz @B
SymCryptSha1AppendBlocksDone:
add rsp, SymCryptSha1AppendBlocksFrame.SaveR12
BEGIN_EPILOGUE
pop r12
pop r13
pop r14
pop r15
pop rdi
pop rsi
pop rbp
pop rbx
ret
NESTED_END SymCryptSha1AppendBlocksAsm, _TEXT
END


@ -1,37 +0,0 @@
;
; SymCrypt_magic.inc
; Copyright (c) Microsoft Corporation. Licensed under the MIT license.
;
; Include file to define the support macros for the Magic field
;
extern SymCryptFatal:NEAR
SYMCRYPT_MAGIC_FIELD MACRO
if DBG
magic dq ?
endif
ENDM
SYMCRYPT_CODE_VERSION EQU ((SYMCRYPT_CODE_VERSION_API SHL 16) OR SYMCRYPT_CODE_VERSION_MINOR )
SYMCRYPT_MAGIC_CONSTANT EQU ('S1mv' + SYMCRYPT_CODE_VERSION)
SYMCRYPT_CHECK_MAGIC MACRO ptr, struct_name
if DBG
mov rax, [ptr + struct_name.magic]
sub rax, ptr
cmp rax, SYMCRYPT_MAGIC_CONSTANT
jz @F
mov ecx, 'magc'
call SymCryptFatal
@@:
endif
ENDM
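In C terms the macro checks the following (function name and parameter passing are illustrative; the constant and SymCryptFatal come from the library):

#include <stdint.h>

extern void SymCryptFatal( uint32_t fatalCode );

// Sketch: the magic field stores (struct address + versioned constant), so
// re-deriving it catches both corruption and structures copied by value.
void CheckMagicSketch( const void *ptr, uint64_t magicField, uint64_t magicConstant )
{
    if( magicField - (uint64_t)(uintptr_t) ptr != magicConstant )
    {
        SymCryptFatal( 0x6d616763 );    // 'magc', as in the asm
    }
}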


@ -1,171 +0,0 @@
;
; Wipe.asm
;
; Copyright (c) Microsoft Corporation. Licensed under the MIT license.
;
include ksamd64.inc
TITLE wipe.asm
;VOID
;SYMCRYPT_CALL
;SymCryptWipe( _Out_writes_bytes_( cbData ) PVOID pbData,
; SIZE_T cbData )
;
; This function allocates no stack space, calls no functions, and does not save
; any non-volatile registers. Thus it is a LEAF function
;
LEAF_ENTRY SymCryptWipeAsm, _TEXT
; rcx = pbData
; rdx = cbData
;
; This function will handle any alignment of pbData and any size, but it is optimized for
; the case where the start and end of the buffer are 16-aligned.
; 16 is the natural stack alignment on AMD64, and structures can be designed to be a multiple
; of 16 long without adding too much slack.
; The cost of non-alignment is relatively low, on the order of 5 cycles
;
xorps xmm0,xmm0 ; Zero register for 16-byte wipes
cmp rdx,16
jb SymCryptWipeAsmSmall ; if cbData < 16, this is a rare case
test rcx,15
jnz SymCryptWipeAsmUnaligned ; if data pointer is unaligned, we jump to the code that aligns the pointer
; For well-optimized callers the aligned case is the common one, and that is
; the fall-through.
SymCryptWipeAsmAligned:
;
; Here rcx is aligned, and rdx contains the # bytes left to wipe, and rdx >= 16
;
; Our loop wipes in 32-byte increments; we always wipe the first 16 bytes
; and increment the pbData pointer if cbData is 16 mod 32
; This avoids a conditional jump and is faster.
;
test rdx,16
movaps [rcx],xmm0 ; it is safe to always wipe as cbData >= 16
lea r8,[rcx+16]
cmovnz rcx,r8 ; only increment pbData if cbData = 16 mod 32
sub rdx,32 ; see if we have >= 32 bytes to wipe
jc SymCryptWipeAsmTailOptional ; if not, wipe tail, or nothing if cbData = 0 mod 16
align 16
SymCryptWipeAsmLoop:
movaps [rcx],xmm0
movaps [rcx+16],xmm0 ; Wipe 32 bytes
add rcx,32
sub rdx,32
jnc SymCryptWipeAsmLoop
SymCryptWipeAsmTailOptional:
; only the lower 4 bits of rdx are valid, we have subtracted too much already.
; The wipe was at least 16 bytes, so we can just wipe the tail with 2 instructions
and edx,15
jnz SymCryptWipeAsmTail
ret
SymCryptWipeAsmTail:
; This code appears also below at the end of the unaligned wiping routine
; but making the jnz jump further is slower and we only duplicate 4 instructions.
xor eax,eax
mov [rcx+rdx-16],rax
mov [rcx+rdx-8],rax
ret
align 4
SymCryptWipeAsmUnaligned:
;
; At this point we know that cbData(rdx) >= 16 and pbData(rcx) is unaligned.
; We can wipe 16 bytes and move to an aligned position
;
xor eax,eax
mov [rcx],rax
mov [rcx+8],rax
mov eax,ecx ;
neg eax ; lower 4 bits of eax = # bytes to wipe to reach alignment
and eax,15
add rcx,rax
sub rdx,rax
;
; If rdx > 16, go to the aligned wiping loop
;
cmp rdx,16
jae SymCryptWipeAsmAligned ; if cbData >= 16, do aligned wipes
;
; We have <= 16 bytes to wipe, and we know that the full wipe region was at least 16 bytes.
; We just wipe the last 16 bytes completely.
;
xor eax,eax
mov [rcx+rdx-16],rax
mov [rcx+rdx-8],rax
ret
align 8
SymCryptWipeAsmSmall:
; rcx = pbData, possibly unaligned
; rdx = cbData; rdx < 16
;
; With speculative execution attacks, the cost of a jump table is prohibitive.
; We use a compare ladder for 5 cases:
; 8-15 bytes
; 4-7 bytes
; 2-3 bytes
; 1 byte
; 0 bytes
xor eax,eax
cmp edx, 8
jb SymCryptWipeAsmSmallLessThan8
; wipe 8-15 bytes using two possibly overlapping writes
mov [rcx], rax
mov [rcx + rdx - 8], rax
ret
SymCryptWipeAsmSmallLessThan8:
cmp edx, 4
jb SymCryptWipeAsmSmallLessThan4
; wipe 4-7 bytes
mov [rcx], eax
mov [rcx + rdx - 4], eax
ret
SymCryptWipeAsmSmallLessThan4:
cmp edx, 2
jb SymCryptWipeAsmSmallLessThan2
; wipe 2-3 bytes
mov [rcx], ax
mov [rcx + rdx - 2], ax
ret
SymCryptWipeAsmSmallLessThan2:
or edx, edx
jz SymCryptWipeAsmSmallDone
; wipe 1 byte
mov [rcx], al
SymCryptWipeAsmSmallDone:
ret
LEAF_END SymCryptWipeAsm, _TEXT
END

lib/amd64/wipe.symcryptasm Normal file

@ -0,0 +1,165 @@
//
// wipe.symcryptasm Assembler code for wiping a buffer
// Expresses asm in a generic enough way to enable generation of MASM and GAS using the
// symcryptasm_processor.py script and C preprocessor
//
// Copyright (c) Microsoft Corporation. Licensed under the MIT license.
#include "symcryptasm_shared.cppasm"
//VOID
//SYMCRYPT_CALL
//SymCryptWipe( _Out_writes_bytes_( cbData ) PVOID pbData,
// SIZE_T cbData )
FUNCTION_START(SymCryptWipeAsm, 2, 4)
// Q1 = pbData
// Q2 = cbData
//
// This function will handle any alignment of pbData and any size, but it is optimized for
// the case where the start and end of the buffer are 16-aligned.
// 16 is the natural stack alignment on AMD64, and structures can be designed to be a multiple
// of 16 long without adding too much slack.
// The cost of non-alignment is relatively low, on the order of 5 cycles
//
xorps xmm0,xmm0 // Zero register for 16-byte wipes
cmp Q2,16
jb SymCryptWipeAsmSmall // if cbData < 16, this is a rare case
test Q1,15
jnz SymCryptWipeAsmUnaligned // if data pointer is unaligned, we jump to the code that aligns the pointer
// For well-optimized callers the aligned case is the common one, and that is
// the fall-through.
SymCryptWipeAsmAligned:
//
// Here Q1 is aligned, and Q2 contains the # bytes left to wipe, and Q2 >= 16
//
// Our loop wipes in 32-byte increments; we always wipe the first 16 bytes
// and increment the pbData pointer if cbData is 16 mod 32
// This avoids a conditional jump and is faster.
//
test Q2,16
movaps [Q1],xmm0 // it is safe to always wipe as cbData >= 16
lea Q3,[Q1+16]
cmovnz Q1,Q3 // only increment pbData if cbData = 16 mod 32
sub Q2,32 // see if we have >= 32 bytes to wipe
jc SymCryptWipeAsmTailOptional // if not, wipe tail, or nothing if cbData = 0 mod 16
ALIGN(16)
SymCryptWipeAsmLoop:
movaps [Q1],xmm0
movaps [Q1+16],xmm0 // Wipe 32 bytes
add Q1,32
sub Q2,32
jnc SymCryptWipeAsmLoop
SymCryptWipeAsmTailOptional:
// only the lower 4 bits of Q2 are valid, we have subtracted too much already.
// The wipe was at least 16 bytes, so we can just wipe the tail with 2 instructions
and D2,15
jnz SymCryptWipeAsmTail
ret
SymCryptWipeAsmTail:
// This code appears also below at the end of the unaligned wiping routine
// but making the jnz jump further is slower and we only duplicate 4 instructions.
xor D0,D0
mov [Q1+Q2-16],Q0
mov [Q1+Q2-8],Q0
ret
ALIGN(4)
SymCryptWipeAsmUnaligned:
//
// At this point we know that cbData(Q2) >= 16 and pbData(Q1) is unaligned.
// We can wipe 16 bytes and move to an aligned position
//
xor D0,D0
mov [Q1],Q0
mov [Q1+8],Q0
mov D0,D1
neg D0 // lower 4 bits of D0 = # bytes to wipe to reach alignment
and D0,15
add Q1,Q0
sub Q2,Q0
//
// If Q2 > 16, go to the aligned wiping loop
//
cmp Q2,16
jae SymCryptWipeAsmAligned // if cbData >= 16, do aligned wipes
//
// We have <= 16 bytes to wipe, and we know that the full wipe region was at least 16 bytes.
// We just wipe the last 16 bytes completely.
//
xor D0,D0
mov [Q1+Q2-16],Q0
mov [Q1+Q2-8],Q0
ret
ALIGN(8)
SymCryptWipeAsmSmall:
// Q1 = pbData, possibly unaligned
// Q2 = cbData; Q2 < 16
//
// With speculative execution attacks, the cost of a jump table is prohibitive.
// We use a compare ladder for 5 cases:
// 8-15 bytes
// 4-7 bytes
// 2-3 bytes
// 1 byte
// 0 bytes
xor D0,D0
cmp D2, 8
jb SymCryptWipeAsmSmallLessThan8
// wipe 8-15 bytes using two possibly overlapping writes
mov [Q1],Q0
mov [Q1+Q2-8],Q0
ret
SymCryptWipeAsmSmallLessThan8:
cmp D2, 4
jb SymCryptWipeAsmSmallLessThan4
// wipe 4-7 bytes
mov [Q1],D0
mov [Q1+Q2-4],D0
ret
SymCryptWipeAsmSmallLessThan4:
cmp D2, 2
jb SymCryptWipeAsmSmallLessThan2
// wipe 2-3 bytes
mov [Q1],W0
mov [Q1+Q2-2],W0
ret
SymCryptWipeAsmSmallLessThan2:
or D2,D2
jz SymCryptWipeAsmSmallDone
// wipe 1 byte
mov [Q1],B0
SymCryptWipeAsmSmallDone:
FUNCTION_END(SymCryptWipeAsm)
FILE_END()
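The small-size path above is worth a sketch: each size class below 16 bytes is covered by two possibly overlapping writes, avoiding a jump table. A hedged C version of just that size logic (hypothetical name; note a real wipe must also defeat dead-store elimination, which the asm does by construction):

#include <stddef.h>
#include <string.h>

void WipeSmallSketch( unsigned char *p, size_t cb )     // cb < 16
{
    static const unsigned char zero[8] = { 0 };

    // Each branch writes the first and last bytes of the range; the two
    // writes overlap for sizes that are not exact powers of two.
    if( cb >= 8 )      { memcpy( p, zero, 8 ); memcpy( p + cb - 8, zero, 8 ); }
    else if( cb >= 4 ) { memcpy( p, zero, 4 ); memcpy( p + cb - 4, zero, 4 ); }
    else if( cb >= 2 ) { memcpy( p, zero, 2 ); memcpy( p + cb - 2, zero, 2 ); }
    else if( cb == 1 ) { p[0] = 0; }
}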


@ -9,7 +9,11 @@
#include "symcrypt_version.inc"
#include "symcrypt_magic.inc"
; As the Arm assembler already uses the C preprocessor, we can just hardcode this asm to include the constants as
; MASM for now. To be fixed properly when converting arm64 asm to symcryptasm.
#define SYMCRYPT_MASM
#include "C_asm_shared.inc"
#undef SYMCRYPT_MASM
; A digit consists of 4 words of 32 bits each
@ -449,11 +453,11 @@ SymCryptFdefRawSquareAsmInnerLoopInit_Word1
SQR_SINGLEADD_32 3
add r2, r2, #16
add r4, r4, #16
adds r3, r3, #1 ; move one digit up
adds r3, r3, #1 ; move one digit up
bne SymCryptFdefRawSquareAsmInnerLoopInit_Word0
str r11, [r4] ; Store the next word into the destination
@ -689,7 +693,7 @@ SymCryptFdefMontgomeryReduceAsmInner
adds r11, r11, r7 ; c + pSrc[nWords] + hc
adc r8, r8, #0 ; Add the carry if any
str r11, [r1], #4 ; pSrc[nWords] = c
adds r12, r12, r6 ; c + pSrc[nWords+1]
adc r9, r9, #0 ; Add the carry if any
adds r12, r12, r8 ; c + pSrc[nWords] + hc
@ -701,7 +705,7 @@ SymCryptFdefMontgomeryReduceAsmInner
add r2, r2, #8 ; Move stored pSrc pointer two words up
ldr r0, [sp, #pMod] ; Restore the pMod pointer
mov r1, r2 ; Restore the pSrc pointer
bne SymCryptFdefMontgomeryReduceAsmOuter
;


@ -16,7 +16,11 @@
#include "symcrypt_name_mangling.inc"
#include "symcrypt_magic.inc"
; As the Arm assembler already uses the C preprocessor, we can just hardcode this asm to include the constants as
; MASM for now. To be fixed properly when converting arm64 asm to symcryptasm.
#define SYMCRYPT_MASM
#include "C_asm_shared.inc"
#undef SYMCRYPT_MASM
; A digit consists of 3 words of 64 bits each
@ -213,7 +217,7 @@ SymCryptFdef369RawMulAsmLoopInner1
adcs x12, x12, x15 ; Adding the previous word (if there was a carry from the last addition it is added)
umulh x15, x6, x8 ; Bits <127:64> of pSrc1[0]*pSrc2[j+2]
str x12, [x4], #8 ; Store to destination
cbnz x3, SymCryptFdef369RawMulAsmLoopInner1
adc x15, x15, XZR ; Store the next word into the destination (with the carry if any)


@ -10,7 +10,11 @@
#include "symcrypt_name_mangling.inc"
#include "symcrypt_magic.inc"
; As the Arm assembler already uses the C preprocessor, we can just hardcode this asm to include the constants as
; MASM for now. To be fixed properly when converting arm64 asm to symcryptasm.
#define SYMCRYPT_MASM
#include "C_asm_shared.inc"
#undef SYMCRYPT_MASM
; A digit consists of 4 words of 64 bits each


@ -517,11 +517,11 @@ SymCryptFdefIntSetValueUint64(
SYMCRYPT_ERROR
SYMCRYPT_CALL
SymCryptFdefRawSetValue(
_In_reads_bytes_(cbSrc) PCBYTE pbSrc,
SIZE_T cbSrc,
SYMCRYPT_NUMBER_FORMAT format,
_Out_writes_(nWords) PUINT32 pDst,
UINT32 nDigits )
_In_reads_bytes_(cbSrc) PCBYTE pbSrc,
SIZE_T cbSrc,
SYMCRYPT_NUMBER_FORMAT format,
_Out_writes_(nDigits * SYMCRYPT_FDEF_DIGIT_NUINT32) PUINT32 pDst,
UINT32 nDigits )
{
SYMCRYPT_ERROR scError;
UINT32 b;
@ -611,11 +611,11 @@ SymCryptFdefIntSetValue(
SYMCRYPT_ERROR
SYMCRYPT_CALL
SymCryptFdefRawGetValue(
_In_reads_(nWords) PCUINT32 pSrc,
UINT32 nDigits,
_Out_writes_bytes_(cbBytes) PBYTE pbDst,
SIZE_T cbDst,
SYMCRYPT_NUMBER_FORMAT format )
_In_reads_(nDigits * SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc,
UINT32 nDigits,
_Out_writes_bytes_(cbBytes) PBYTE pbDst,
SIZE_T cbDst,
SYMCRYPT_NUMBER_FORMAT format )
{
SYMCRYPT_ERROR scError;
UINT32 b;


@ -722,11 +722,11 @@ SymCryptFdefIntSquare(
VOID
SYMCRYPT_CALL
SymCryptFdefRawMulC(
_In_reads_(nWords1) PCUINT32 pSrc1,
UINT32 nDigits1,
_In_reads_(nWords2) PCUINT32 pSrc2,
UINT32 nDigits2,
_Out_writes_(nWords1 + nWords2) PUINT32 pDst )
_In_reads_(nDigits1 * SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc1,
UINT32 nDigits1,
_In_reads_(nDigits2 * SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc2,
UINT32 nDigits2,
_Out_writes_((nDigits1+nDigits2)*SYMCRYPT_FDEF_DIGIT_NUINT32) PUINT32 pDst )
{
UINT32 nWords1 = nDigits1 * SYMCRYPT_FDEF_DIGIT_NUINT32;
UINT32 nWords2 = nDigits2 * SYMCRYPT_FDEF_DIGIT_NUINT32;
@ -778,9 +778,9 @@ SymCryptFdefRawMul(
VOID
SYMCRYPT_CALL
SymCryptFdefRawSquareC(
_In_reads_(nWords) PCUINT32 pSrc,
UINT32 nDigits,
_Out_writes_(2*nWords) PUINT32 pDst )
_In_reads_(nDigits * SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc,
UINT32 nDigits,
_Out_writes_(2*nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PUINT32 pDst )
{
UINT32 nWords = nDigits * SYMCRYPT_FDEF_DIGIT_NUINT32;


@ -1223,7 +1223,7 @@ SymCryptFdefModMulMontgomery(
SymCryptFdefMontgomeryReduce( pmMod, pTmp, &peDst->d.uint32[0] );
}
#if SYMCRYPT_CPU_AMD64 && SYMCRYPT_MS_VC
#if SYMCRYPT_CPU_AMD64
VOID
SYMCRYPT_CALL
SymCryptFdefModMulMontgomeryMulx(
@ -1283,7 +1283,7 @@ SymCryptFdefModSquareMontgomery(
}
#if SYMCRYPT_CPU_AMD64 && SYMCRYPT_MS_VC
#if SYMCRYPT_CPU_AMD64
VOID
SYMCRYPT_CALL
SymCryptFdefModSquareMontgomeryMulx(
@ -1356,70 +1356,12 @@ SymCryptFdefModInvMontgomery(
return scError;
}
#if SYMCRYPT_CPU_AMD64 && SYMCRYPT_MS_VC
#if SYMCRYPT_CPU_AMD64
//=====================================
// 256-bit Montgomery modulus code
//
VOID
SYMCRYPT_CALL
SymCryptFdefModAdd256Test(
_In_ PCSYMCRYPT_MODULUS pmMod,
_In_ PCSYMCRYPT_MODELEMENT peSrc1,
_In_ PCSYMCRYPT_MODELEMENT peSrc2,
_Out_ PSYMCRYPT_MODELEMENT peDst,
_Out_writes_bytes_( cbScratch ) PBYTE pbScratch,
SIZE_T cbScratch )
{
SYMCRYPT_ASYM_ALIGN BYTE buf1[128];
SYMCRYPT_ASYM_ALIGN BYTE buf2[128];
PSYMCRYPT_MODELEMENT peTmp1 = SymCryptModElementCreate( SYMCRYPT_ASYM_ALIGN_UP( buf1 ), sizeof( buf1 ) - SYMCRYPT_ASYM_ALIGN_VALUE, pmMod );
PSYMCRYPT_MODELEMENT peTmp2 = SymCryptModElementCreate( SYMCRYPT_ASYM_ALIGN_UP( buf2 ), sizeof( buf2 ) - SYMCRYPT_ASYM_ALIGN_VALUE, pmMod );
(VOID) peTmp1;
(VOID) peTmp2;
SymCryptFdefModAdd256Asm( pmMod, peSrc1, peSrc2, peTmp1, pbScratch, cbScratch );
SymCryptFdefModAddGeneric( pmMod, peSrc1, peSrc2, peTmp2, pbScratch, cbScratch );
if( memcmp( peTmp1, peTmp2, 64 ) != 0 )
{
SymCryptFatal( 42 );
}
SymCryptFdefModAdd256Asm( pmMod, peSrc1, peSrc2, peDst, pbScratch, cbScratch );
}
VOID
SYMCRYPT_CALL
SymCryptFdefModMulMontgomery256Test(
_In_ PCSYMCRYPT_MODULUS pmMod,
_In_ PCSYMCRYPT_MODELEMENT peSrc1,
_In_ PCSYMCRYPT_MODELEMENT peSrc2,
_Out_ PSYMCRYPT_MODELEMENT peDst,
_Out_writes_bytes_( cbScratch ) PBYTE pbScratch,
SIZE_T cbScratch )
{
SYMCRYPT_ASYM_ALIGN BYTE buf1[128];
SYMCRYPT_ASYM_ALIGN BYTE buf2[128];
PSYMCRYPT_MODELEMENT peTmp1 = SymCryptModElementCreate( SYMCRYPT_ASYM_ALIGN_UP( buf1 ), sizeof( buf1 ) - SYMCRYPT_ASYM_ALIGN_VALUE, pmMod );
PSYMCRYPT_MODELEMENT peTmp2 = SymCryptModElementCreate( SYMCRYPT_ASYM_ALIGN_UP( buf2 ), sizeof( buf2 ) - SYMCRYPT_ASYM_ALIGN_VALUE, pmMod );
(VOID) peTmp1;
(VOID) peTmp2;
SymCryptFdefModMulMontgomery256Asm( pmMod, peSrc1, peSrc2, peTmp1, pbScratch, cbScratch );
//SymCryptFdefModMulMontgomery( pmMod, peSrc1, peSrc2, peTmp2, pbScratch, cbScratch ); *** This doesn't produce the same result as it reduces a whole digit, not 256 bits
if( memcmp( peTmp1, peTmp2, 64 ) != 0 )
{
// SymCryptFatal( 42 );
}
SymCryptFdefModMulMontgomery256Asm( pmMod, peSrc1, peSrc2, peDst, pbScratch, cbScratch );
}
VOID
SYMCRYPT_CALL
SymCryptFdefModSquareMontgomery256(


@ -1,5 +1,6 @@
;
; fdef_asm.asm Assembler code for fast arithmetic
; fdef_asm.cppasm Assembler code for fast arithmetic
; Requires C preprocessor to correctly include C_asm_shared.inc
;
; Copyright (c) Microsoft Corporation. Licensed under the MIT license.
;
@ -11,9 +12,9 @@
;
; FPO documentation:
; The .FPO provides debugging information.
; This stuff not well documented,
; This stuff not well documented,
; but here is the information I've gathered about the arguments to .FPO
;
;
; In order:
; cdwLocals: Size of local variables, in DWords
; cdwParams: Size of parameters, in DWords. Given that this is all about
@ -23,7 +24,7 @@
; prolog code with work for better performance. Most uses of
; .FPO seem to set this value to 0 anyway, which is what we
; will do.
; cbRegs : # registers saved in the prolog.
; cbRegs : # registers saved in the prolog.
; fUseBP : 0 if EBP is not used as base pointer, 1 if EBP is used as base pointer
; cbFrame : Type of frame.
; 0 = FPO frame (no frame pointer)
@ -43,7 +44,7 @@ _TEXT SEGMENT PARA PUBLIC USE32 'CODE'
include symcrypt_version.inc
include symcrypt_magic.inc
include C_asm_shared.inc
#include "C_asm_shared.inc"
PUBLIC @SymCryptFdefRawAddAsm@16
PUBLIC @SymCryptFdefRawSubAsm@16
@ -60,7 +61,7 @@ BEFORE_PROC MACRO
;
DB 5 dup (0cch)
ENDM
@ -86,7 +87,7 @@ pDst dd ?
nDigits dd ?
SymCryptFdefRawAddAsmFrame ends
; ecx = pSrc1
; edx = pSrc2
@ -129,7 +130,7 @@ SymCryptFdefRawAddAsmLoop:
pop edi
pop ebx
ret 8
@SymCryptFdefRawAddAsm@16 ENDP
@ -154,7 +155,7 @@ pDst dd ?
nDigits dd ?
SymCryptFdefRawSubAsmFrame ends
; ecx = pSrc1
; edx = pSrc2
@ -197,7 +198,7 @@ SymCryptFdefRawSubAsmLoop:
pop edi
pop ebx
ret 8
@SymCryptFdefRawSubAsm@16 ENDP
@ -305,8 +306,8 @@ SymCryptFdefRawMulAsmFrame ends
; for each word in Src1:
; Dst += Src2 * word
; Register assignments
;
; eax = tmp/lower half of mult
;
; eax = tmp/lower half of mult
; ebx = multiplicant
; ecx = loop counter, initialized to nDigits2
; edx = upper half of mult
@ -315,7 +316,7 @@ SymCryptFdefRawMulAsmFrame ends
; ebp = carry
;
; esp + pSrc1 running pointer into Src1
; esp +
; esp +
mov edi,edi
@ -436,7 +437,7 @@ SymCryptFdefRawMulAsmLoop2:
adc edx, 0
mov [edi + 12], eax
mov ebp, edx
add esi, 16
add edi, 16
sub ecx,1
@ -477,7 +478,7 @@ SymCryptFdefMontgomeryReduceAsmFrame struct 4, NONUNIQUE
HighCarry dd ?
pSrc dd ?
pModValue dd ?
nWords dd ?
nWords dd ?
SaveEbp dd ? ; # words still to process in outer loop
SaveEsi dd ?
SaveEdi dd ?
@ -513,13 +514,13 @@ SymCryptFdefMontgomeryReduceAsmFrame ends
SymCryptFdefMontgomeryReduceOuterLoop:
; eax = <undef>
; ebx = <undef>
; ecx = <undef>
; ecx = <undef>
; edx = <undef>
; esi = start of mod value
; edi = pSrc + 4 * loop iteration count
; ebp = <undef>
; compute multiplier for this outer loop iteration.
; compute multiplier for this outer loop iteration.
mov ebx, [esi - SymCryptModulusValueOffsetX86 + SymCryptModulusMontgomeryInv64OffsetX86 ]
imul ebx, [edi] ; word we want to zero out, ebx = multiplier for this inner loop
@ -529,7 +530,7 @@ SymCryptFdefMontgomeryReduceOuterLoop:
SymCryptFdefMontgomeryReduceInnerLoop:
; eax = mul scratch
; ebx = multiplier
; ecx = digit counter
; ecx = digit counter
; edx = mul scratch
; esi = running pointer to mod value
; edi = running pointer to input/scratch
@ -570,7 +571,7 @@ SymCryptFdefMontgomeryReduceInnerLoop:
adc edx, 0
mov [edi + 12], eax
mov ebp, edx
add esi, 16
add edi, 16
sub ecx,1
@ -606,7 +607,7 @@ SymCryptFdefMontgomeryReduceInnerLoop:
mov ecx, [esi - SymCryptModulusValueOffsetX86 + SymCryptModulusNdigitsOffsetX86] ; loop counter
mov edx, [esp + SymCryptFdefMontgomeryReduceAsmFrame.pDst];
; ecx = nDigits
; Save some values for the copy loop


@ -1,314 +0,0 @@
;
; rc4asm.asm
;
; Copyright (c) Microsoft Corporation. Licensed under the MIT license.
;
; RC4 implementation in x86 assembler
; This is a new RC4 implementation for SymCrypt.
; It is NOT based on the existing one in RSA32.lib.
;
TITLE "RC4"
.586P
_TEXT SEGMENT PARA PUBLIC USE32 'CODE'
ASSUME CS:_TEXT, DS:FLAT, SS:FLAT
include symcrypt_version.inc
include symcrypt_magic.inc
;
; Structure definition that mirrors the SYMCRYPT_RC4_STATE struct
;
RC4_STATE struct
S db 256 dup (?)
i db ?
j db ?
SYMCRYPT_MAGIC_FIELD
RC4_STATE ends
PUBLIC @SymCryptRc4InitAsm@12
PUBLIC @SymCryptRc4CryptAsm@16
BEFORE_PROC MACRO
;
; Our current x86 compiler inserts 5 0xcc bytes before every function
; and starts every function with a 2-byte NOP.
; This supports hot-patching.
;
DB 5 dup (0cch)
ENDM
; The .FPO provides debugging information.
; This stuff not well documented,
; but here is the information I've gathered about the arguments to .FPO
;
; In order:
; cdwLocals: Size of local variables, in DWords
; cdwParams: Size of parameters, in DWords. Given that this is all about
; stack stuff, I'm assuming this is only about parameters passed
; on the stack.
; cbProlog : Number of bytes in the prolog code. We have interleaved the
; prolog code with work for better performance. Most uses of
; .FPO seem to set this value to 0 anyway, which is what we
; will do.
; cbRegs : # registers saved in the prolog. 4 in our case
; fUseBP : 0 if EBP is not used as base pointer, 1 if EBP is used as base pointer
; cbFrame : Type of frame.
; 0 = FPO frame (no frame pointer)
; 1 = Trap frame (result of a CPU trap event)
; 2 = TSS frame
;
; Having looked at various occurrences of .FPO in the Windows code it
; seems to be used fairly sloppy, with lots of arguments left 0 even when
; they probably shouldn't be according to the spec.
;
BEFORE_PROC
@SymCryptRc4InitAsm@12 PROC
;VOID
;SYMCRYPT_CALL
;SymCryptRc4InitAsm(
; _Out_ PSYMCRYPT_RC4_STATE pState,
; _In_reads_bytes_( cbKey ) PCBYTE pbKey,
; _In_ SIZE_T cbKey );
;
; NOTE: Unlike the SymCryptRc4Init function
; this function does not check the cbKey validity, and does not return an error code.
; Currently we don't have the error code values symbolically in the asm environment.
; We use an inlined function to generate the errors instead, and call this function
; only when there are no errors.
;
Rc4InitFrame struct 4, NONUNIQUE
pbKey dd ?
SaveEdi dd ?
SaveEsi dd ?
SaveEbp dd ?
SaveEbx dd ?
ReturnAddress dd ?
cbKey dd ?
Rc4InitFrame ends
.FPO(5,1,0,4,0,0)
; ecx = pState
; edx = pKey
; [esp + 4] = cbKey
;
; Set up stack frame, and initialize pbKey
;
mov edi,edi ; 2-byte NOP for hot-patching
push ebx
push ebp
push esi
push edi
push edx
;
; Initialize S[i] = i
;
lea esi,[ecx + 100h]
mov edi,ecx
mov eax,03020100h
mov ebx,04040404h
@@:
mov [edi],eax
add eax,ebx
mov [edi+4],eax
add eax,ebx
mov [edi+8],eax
add eax,ebx
mov [edi+12],eax
add eax,ebx
add edi,16
cmp edi,esi
jb @B
mov ebp,edx
xor ebx,ebx ; j = 0
xor esi,esi ; i = 0
mov edi,[esp + Rc4InitFrame.cbKey]
add edi, edx ; edi = pbKey + cbKey
SymCryptRc4InitMainLoop:
; Registers:
; eax = Tmp1
; ebx = j
; ecx = S
; edx = Tmp2
; esi = i
; edi = keyLimit ; just beyond the key
; ebp = pKey ; pointer to current key byte
movzx edx,byte ptr[ebp] ; get key byte
add ebx,edx ; j += key byte
movzx eax,byte ptr[ecx + esi] ; get S[i]
add ebx,eax ; j += S[i]
and ebx,0ffh
movzx edx,byte ptr [ecx + ebx]; get S[j]
mov byte ptr[ecx + ebx], al ; update S[j]
mov byte ptr[ecx + esi], dl ; update S[i]
add ebp,1 ; increment key pointer modulo key length
cmp ebp,edi
jb @F
mov ebp,[esp + Rc4InitFrame.pbKey]
@@:
add esi,1 ; increment i
cmp esi,100h
jb SymCryptRc4InitMainLoop
mov word ptr [ecx + RC4_STATE.i], 1 ; i = 1; j = 0
add esp,4
pop edi
pop esi
pop ebp
pop ebx
ret 4
@SymCryptRc4InitAsm@12 ENDP
BEFORE_PROC
@SymCryptRc4CryptAsm@16 PROC
;VOID
;SYMCRYPT_CALL
;SymCryptRc4Crypt(
; _Inout_ PSYMCRYPT_RC4_STATE pState,
; _In_reads_bytes_( cbData ) PCBYTE pbSrc,
; _Out_writes_bytes_( cbData ) PBYTE pbDst,
; _In_ SIZE_T cbData )
Rc4CryptFrame struct 4, NONUNIQUE
pbEndDst dd ?
SaveEdi dd ?
SaveEsi dd ?
SaveEbp dd ?
SaveEbx dd ?
ReturnAddress dd ?
pbDst dd ?
cbData dd ?
Rc4CryptFrame ends
.FPO(5,2,0,4,0,0)
mov edi,edi
push ebx
push ebp
push esi
push edi
sub esp,4
SYMCRYPT_CHECK_MAGIC ecx, RC4_STATE
mov eax,[esp + Rc4CryptFrame.cbData]
test eax,eax
jz Rc4CryptDoNothing
mov ebp,[esp + Rc4CryptFrame.pbDst]
add eax,ebp
mov [esp + Rc4CryptFrame.pbEndDst], eax
mov edi, edx
movzx edx,[ecx + RC4_STATE.i]
movzx esi,[ecx + RC4_STATE.j]
;
; Further perf improvements are possible.
; Instead of encrypting byte-by-byte, we can collect 4 bytes of the key
; stream in a register, and then encrypt 4 bytes at a time.
; This reduces the # memory operations we do per byte.
; Ideally this is done with aligned operations, either
; aligning to pbSrc, pbDst, or to i (which removes the need to increment i every time).
;
@@:
; eax Ti
; ebx Tj
; ecx S
; edx i
; esi j
; edi pSrc
; ebp pDst
movzx eax, byte ptr[ecx + edx] ; Ti = S[i]
;add esi, eax
;and esi, 0ffh
lea ebx, [esi + eax]
movzx esi, bl ; j += Ti
movzx ebx, byte ptr[ecx + esi] ; Tj = S[j]
mov [ecx + edx], bl ; S[i] = Tj
mov [ecx + esi], al ; S[j] = Ti
;add eax,ebx
;and eax,0ffh
lea eax,[eax + ebx]
movzx eax,al ; Ti = Ti + Tj
mov al,[ecx + eax] ; al = S[(Ti + Tj) MOD 256], the key stream byte
;add edx, 1
;and 0ffh
lea edx,[edx + 1]
movzx edx,dl ; i += 1
xor al,[edi]
add edi,1
mov [ebp],al
add ebp, 1
cmp ebp,[esp + Rc4CryptFrame.pbEndDst]
jb @B
mov eax, esi
mov [ecx + RC4_STATE.i], dl
mov [ecx + RC4_STATE.j], al
Rc4CryptDoNothing:
add esp,4
pop edi
pop esi
pop ebp
pop ebx
ret 8
@SymCryptRc4CryptAsm@16 ENDP
_TEXT ENDS
END
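This file is removed outright; for future readers, here is a brief C sketch of the crypt loop it implemented (names illustrative; same per-byte recurrence as the register comments above, with i starting at 1 after init):

#include <stddef.h>
#include <stdint.h>

typedef struct { uint8_t S[256]; uint8_t i; uint8_t j; } Rc4SketchState;

void Rc4CryptSketch( Rc4SketchState *pState, const uint8_t *pbSrc,
                     uint8_t *pbDst, size_t cbData )
{
    uint8_t i = pState->i;
    uint8_t j = pState->j;
    for( size_t k = 0; k < cbData; k++ )
    {
        uint8_t Ti = pState->S[i];
        j = (uint8_t)(j + Ti);
        uint8_t Tj = pState->S[j];
        pState->S[i] = Tj;                          // swap S[i] and S[j]
        pState->S[j] = Ti;
        pbDst[k] = pbSrc[k] ^ pState->S[(uint8_t)(Ti + Tj)];
        i = (uint8_t)(i + 1);
    }
    pState->i = i;
    pState->j = j;
}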


@ -1,383 +0,0 @@
;
; Sha1Asm.Asm
;
; Copyright (c) Microsoft Corporation. Licensed under the MIT license.
;
;
;
; This module implements the bulk processing of the FIPS 180-1 SHA message digest algorithm
; for the x86 processor architecture.
;
; This implementation is derived from an older one by Scott Field and
; Dan Shumow.
;
; This implementation is optimized for Intel Core and contemporary AMD CPUs.
; Optimizations for pre-P3 Intel CPUs have been removed.
;
TITLE sha1asm.asm
.486
_TEXT SEGMENT PARA PUBLIC USE32 'CODE'
ASSUME CS:_TEXT, DS:FLAT, SS:FLAT
PUBLIC @SymCryptSha1AppendBlocksAsm@12
;
; The four round constants used by SHA-1
;
K0_19 EQU 05a827999H
K20_39 EQU 06ed9eba1H
K40_59 EQU 08f1bbcdcH
K60_79 EQU 0ca62c1d6H
align 16
;VOID
;SYMCRYPT_CALL
;SymCryptSha1AppendBlocks( _Inout_updates_( 5 ) PUINT32 H,
; _In_reads_bytes_( cbData ) PCBYTE pbData,
; SIZE_T cbData )
;
@SymCryptSha1AppendBlocksAsm@12 PROC
;
; To keep stack manipulations simple we define a structure and use that for all accesses.
;
SymCryptSha1AppendBlocksFrame struct 4, NONUNIQUE
Wbuf dd 16 dup (?)
Hptr dd ?
pbData dd ?
BlockCount dd ?
SaveEdi dd ?
SaveEsi dd ?
SaveEbp dd ?
SaveEbx dd ?
ReturnAddress dd ?
CbData dd ?
SymCryptSha1AppendBlocksFrame ends
;
; We use the W buffer extensively; this is a shorthand for the base address
;
W equ esp+SymCryptSha1AppendBlocksFrame.Wbuf
;
; The .FPO provides debugging information for stack frames that do not use
; ebp as a base pointer.
; This stuff not well documented,
; but here is the information I've gathered about the arguments to .FPO
;
; In order:
; cdwLocals: Size of local variables, in DWords
; cdwParams: Size of parameters, in DWords. Given that this is all about
; stack stuff, I'm assuming this is only about parameters passed
; on the stack.
; cbProlog : Number of bytes in the prolog code. We sometimes interleaved the
; prolog code with work for better performance. Most uses of
; .FPO seem to set this value to 0.
; The debugger seems to work if the prolog defined by this value
; contains all the stack adjustments.
; cbRegs : # registers saved in the prolog. 4 in our case
; fUseBP : 0 if EBP is not used as base pointer, 1 if EBP is used as base pointer
; cbFrame : Type of frame.
; 0 = FPO frame (no frame pointer)
; 1 = Trap frame (result of a CPU trap event)
; 2 = TSS frame
;
; Having looked at various occurrences of .FPO in the Windows code it
; seems to be used fairly sloppy, with lots of arguments left 0 even when
; they probably shouldn't be according to the spec.
;
.FPO(23,1,3,4,0,0) ; 3 byte prolog (covers esp adjustment only)
; At this point:
; ecx = H
; edx = pbData
; [esp+4] = cbData
;
; Set up our stack frame and save non-volatile registers
;
sub esp,SymCryptSha1AppendBlocksFrame.ReturnAddress
mov [esp+SymCryptSha1AppendBlocksFrame.SaveEbp],ebp
mov [esp+SymCryptSha1AppendBlocksFrame.SaveEdi],edi
mov [esp+SymCryptSha1AppendBlocksFrame.SaveEsi],esi
mov [esp+SymCryptSha1AppendBlocksFrame.SaveEbx],ebx
mov [esp+SymCryptSha1AppendBlocksFrame.Hptr], ecx
;
; To allow macro re-ordering of our registers we use symbolic names
; for the registers.
; r0-r4 are the 5 state registers. x1 and x2 are extra scratch registers.
; Note: some prolog code puts the right value in the right register and
; has to be updated if this mapping is changed.
;
r0 EQU eax
r1 EQU ebx
r2 EQU ecx
r3 EQU edx
r4 EQU esi
x1 EQU ebp
x2 EQU edi
;
; compute how many blocks we will process.
; This code ensures that we never exceed the data buffer we were given,
; although we silently round the cbData parameter down to the next
; multiple of 64.
; Do nothing if no blocks need to be processed.
;
mov eax,[esp+SymCryptSha1AppendBlocksFrame.CbData]
shr eax,6
jz SymCryptSha1AppendBlocksDone
mov [esp+SymCryptSha1AppendBlocksFrame.BlockCount], eax
;
; The data pointer goes into x1 = ebp at the start of our loop
;
mov ebp,edx
;
; Load the H state from [ecx], making sure we load the r2=ecx register
; last.
;
mov r0,[ecx ]
mov r4,[ecx+ 4]
mov r3,[ecx+ 8]
mov r1,[ecx+16]
mov r2,[ecx+12]
SymCryptSha1AppendBlocksLoop:
;
; This is the main loop. We process 64 bytes in each iteration.
; invariant: ebp = pbData
;
;
; Most of the code in the loop is generated through macros using parameters to
; rename the registers.
; The macros get the register number passed as parameter. They use
; "r&<param>" to paste the number and the 'r' together and get the register
; name we defined above.
;
ROUND_CH_0_15 MACRO round,ra,rb,rc,rd,re,x1,x2
;
; Code for round 0-15.
; This code loads data from the data buffer & BSWAPs the data to get it into the
; right form.
;
; Parameters:
; round round number
; ra register number that contains the a value
; rb register number that contains the b value
; rc register number that contains the c value
; rd register number that contains the d value
; re register number that contains the e value
; x1 pointer to the input data
; x2 scratch register.
;
; We use the formula CH(b,c,d) = ((d ^ c) & b) ^ d which uses only one temp register.
; We start with the d value as that is the oldest value and available the first
;
; See FIPS 180-2 for our symbolic notation.
;
mov x2,[x1+4*round] ; Fetch word from message
bswap x2 ; x2 = Wt
add r&re,x2 ; re = e + Wt
mov [W + 4*round],x2 ; Store in W buffer for future use
mov x2,r&ra ; x2 = a
rol x2,5 ; x2 = ROL(a,5)
add r&re,x2 ; re = e + Wt + ROL(a,5)
mov x2,r&rd ; x2 = d
xor x2,r&rc ; x2 = (d ^ c)
and x2,r&rb ; x2 = ((d ^ c) & b)
ror r&rb,2 ; rb = ROL( b, 30 )
xor x2,r&rd ; x2 = ((d ^ c) & b) ^ d = CH(b,c,d)
lea r&re,[r&re+x2+K0_19] ; re = e + Wt + ROL(a,5) + Ch(b,c,d) + Kt
ENDM
ROUND_CH MACRO round, ra, rb, rc, rd, re, x1, x2
;
; See ROUND_CH_0_15 for most parameters.
; x1 and x2 are both scratch registers
;
mov x2,[W+4*((round-16) MOD 16)] ; x2 = W[t-16]
mov x1,r&ra ; x1 = a
rol x1,5 ; x1 = ROL(a,5)
xor x2,[W+4*((round-14) MOD 16)] ; x2 = W[t-16] ^ W[t-14]
add r&re,x1 ; re = e + ROL(a,5)
mov x1,r&rd ; x1 = d
xor x2,[W+4*((round- 8) MOD 16)] ; x2 = W[t-16] ^ W[t-14] ^ W[t-8]
xor x1,r&rc ; x1 = (d ^ c)
and x1,r&rb ; x1 = ((d ^ c) & b)
xor x2,[W+4*((round- 3) MOD 16)] ; x2 = W[t-16] ^ W[t-14] ^ W[t-8] ^ W[t-3]
xor x1,r&rd ; x1 = ((d ^ c) & b) ^ d = CH(b,c,d)
rol x2,1 ; x2 = Wt
mov [W+4*((round-16) MOD 16)],x2 ;
add r&re,x2 ; re = e + ROL(a,5) + Wt
ror r&rb,2 ; rb = ROL( b, 30 )
lea r&re,[r&re+x1+K0_19] ; re = e + Wt + ROL(a,5) + Ch(b,c,d) + Kt
ENDM
ROUND_PARITY MACRO round, ra, rb, rc, rd, re, x1, x2, K, store
;
; See ROUND_CH for most parameters
; K is the round constant to use.
; store is 1 if the Wt value should be stored, 0 otherwise
; (used to avoid stores in the last few rounds)
;
; The order of xorring the registers b, c, and d is driven by the data dependency graph.
; We start with d (the oldest) and then do b to unblock the subsequent rotate
;
mov x2,[W+4*((round-16) MOD 16)] ; x2 = W[t-16]
mov x1,r&ra ; x1 = a
rol x1,5 ; x1 = ROL(a,5)
xor x2,[W+4*((round-14) MOD 16)] ; x2 = W[t-16] ^ W[t-14]
add r&re,x1 ; re = e + ROL(a,5)
mov x1,r&rd ; x1 = d
xor x2,[W+4*((round- 8) MOD 16)] ; x2 = W[t-16] ^ W[t-14] ^ W[t-8]
xor x1,r&rb ; x1 = (d ^ b)
xor x1,r&rc ; x1 = (d ^ b ^ c) = Parity(b,c,d)
xor x2,[W+4*((round- 3) MOD 16)] ; x2 = W[t-16] ^ W[t-14] ^ W[t-8] ^ W[t-3]
rol x2,1 ; x2 = Wt
add r&re,x1 ; re = e + ROL(a,5) + Parity(b,c,d)
IF store
mov [W+4*((round-16) MOD 16)],x2 ;
ENDIF
ror r&rb,2 ; rb = ROL( b, 30 )
lea r&re,[r&re+x2+K] ; re = e + ROL(a,5) + Parity(b,c,d) + Wt + Kt
ENDM
ROUND_MAJ MACRO round, ra, rb, rc, rd, re, x1, x2
;
; See above for parameter explanation
;
mov x2,[W+4*((round-16) MOD 16)] ; x2 = W[t-16]
mov x1,r&ra ; x1 = a
rol x1,5 ; x1 = ROL(a,5)
xor x2,[W+4*((round-14) MOD 16)] ; x2 = W[t-16] ^ W[t-14]
add r&re,x1 ; re = e + ROL(a,5)
mov x1,r&rd ; x1 = d
xor x2,[W+4*((round- 8) MOD 16)] ; x2 = W[t-16] ^ W[t-14] ^ W[t-8]
or x1,r&rc ; x1 = (d | c)
and x1,r&rb ; x1 = ((d | c) & b)
xor x2,[W+4*((round- 3) MOD 16)] ; x2 = W[t-16] ^ W[t-14] ^ W[t-8] ^ W[t-3] = Wt
rol x2,1 ; x2 = Wt
add r&re,x2 ; re = e + ROL(a,5) + Wt
mov [W+4*((round-16) MOD 16)],x2 ;
mov x2,r&rc ; x2 = c
and x2,r&rd ; x2 = (c & d)
or x1,x2 ; x1 = ((d | c) & b) | (d & c) = MAJ(b,c,d)
ror r&rb,2 ; rb = ROL( b, 30 )
lea r&re,[r&re+x1+K40_59] ; re = e + ROL(a,5) + Wt + Maj(b,c,d) + Kt
ENDM
;
; With these macros we can now produce the actual code.
; Note the use of the % operator which evaluates the expression and yields the result as text.
; Together with the macros and the r<i> EQUs this provides us with automatic register renaming
; for each round.
;
FOR t, <0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15>
ROUND_CH_0_15 t, %(t MOD 5), %((t + 4) MOD 5), %((t + 3) MOD 5), %((t + 2) MOD 5), %((t + 1) MOD 5), x1, x2
ENDM
;
; For the rest of the computation we need the extra register, so we update the data pointer and store it.
;
add ebp,64
mov [esp+SymCryptSha1AppendBlocksFrame.pbData], ebp
FOR t, <16, 17, 18, 19>
ROUND_CH t, %(t MOD 5), %((t + 4) MOD 5), %((t + 3) MOD 5), %((t + 2) MOD 5), %((t + 1) MOD 5), x1, x2
ENDM
FOR t, <20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39>
ROUND_PARITY t, %(t MOD 5), %((t + 4) MOD 5), %((t + 3) MOD 5), %((t + 2) MOD 5), %((t + 1) MOD 5), x1, x2, K20_39, 1
ENDM
FOR t, <40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59>
ROUND_MAJ t, %(t MOD 5), %((t + 4) MOD 5), %((t + 3) MOD 5), %((t + 2) MOD 5), %((t + 1) MOD 5), x1, x2
ENDM
FOR t, <60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76>
ROUND_PARITY t, %(t MOD 5), %((t + 4) MOD 5), %((t + 3) MOD 5), %((t + 2) MOD 5), %((t + 1) MOD 5), x1, x2, K60_79, 1
ENDM
;
; The last three rounds do not need to store their Wt in the W buffer as that value will never get used.
;
FOR t, <77, 78, 79>
ROUND_PARITY t, %(t MOD 5), %((t + 4) MOD 5), %((t + 3) MOD 5), %((t + 2) MOD 5), %((t + 1) MOD 5), x1, x2, K60_79, 0
ENDM
;
; Now we update the state
;
mov x2,[esp+SymCryptSha1AppendBlocksFrame.Hptr]
add r0,[x2 ]
add r4,[x2+ 4]
add r3,[x2+ 8]
add r2,[x2+12]
add r1,[x2+16]
mov [x2 ], r0
mov [x2+ 4], r4
mov [x2+ 8], r3
mov [x2+12], r2
mov [x2+16], r1
;
; See if we have more data to process, and load the data pointer register again
;
dec [esp+SymCryptSha1AppendBlocksFrame.BlockCount]
mov ebp, [esp+SymCryptSha1AppendBlocksFrame.pbData]
jnz SymCryptSha1AppendBlocksLoop
;
; We're done processing the blocks. The result is already in the state, so all we have to do
; is clean up.
;
; Wipe the W buffer
; The @@: label is an anonymous label. You can refer to the previous one using @B, which is easy to read.
;
mov ecx,8
xor eax,eax
@@: dec ecx
mov [esp+8*ecx],eax
mov [esp+8*ecx+4],eax
jnz @B
SymCryptSha1AppendBlocksDone:
;
; Restore non-volatile registers & stack pointer
;
mov ebp,[esp+SymCryptSha1AppendBlocksFrame.SaveEbp]
mov edi,[esp+SymCryptSha1AppendBlocksFrame.SaveEdi]
mov esi,[esp+SymCryptSha1AppendBlocksFrame.SaveEsi]
mov ebx,[esp+SymCryptSha1AppendBlocksFrame.SaveEbx]
add esp,SymCryptSha1AppendBlocksFrame.ReturnAddress
ret 4
@SymCryptSha1AppendBlocksAsm@12 ENDP
_TEXT ENDS
END


@ -7,9 +7,7 @@
#include "precomp.h"
#define EQU =
#include "C_asm_shared.inc"
#undef EQU
#include "buildInfo.h"
@ -34,16 +32,16 @@ SymCryptLibraryWasNotInitialized()
#endif
const CHAR * SymCryptBuildString =
"v" SYMCRYPT_BUILD_INFO_VERSION
"_" SYMCRYPT_BUILD_INFO_BRANCH
"_" SYMCRYPT_BUILD_INFO_COMMIT
"_" SYMCRYPT_BUILD_INFO_TIMESTAMP;
VOID
SYMCRYPT_CALL
SymCryptInitEnvCommon( UINT32 version )
// Returns TRUE if the initializatoin steps have to be performed.
// Returns TRUE if the initialization steps have to be performed.
{
UINT32 tmp;


@ -1,223 +0,0 @@
//
// asmstubs.c
// Temporary forwarders for ASM implementations which we don't yet support with GCC/LLVM
//
// Copyright (c) Microsoft Corporation. Licensed under the MIT license.
//
#include "../precomp.h"
extern const SYMCRYPT_BLOCKCIPHER SymCryptAesBlockCipherNoOpt;
VOID
SYMCRYPT_CALL
SymCryptAesEncryptAsm(
_In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
_In_reads_(SYMCRYPT_AES_BLOCK_SIZE) PCBYTE pbSrc,
_Out_writes_(SYMCRYPT_AES_BLOCK_SIZE) PBYTE pbDst )
{
SymCryptAesEncryptC( pExpandedKey, pbSrc, pbDst );
}
VOID
SYMCRYPT_CALL
SymCryptAesDecryptAsm(
_In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
_In_reads_(SYMCRYPT_AES_BLOCK_SIZE) PCBYTE pbSrc,
_Out_writes_(SYMCRYPT_AES_BLOCK_SIZE) PBYTE pbDst )
{
SymCryptAesDecryptC( pExpandedKey, pbSrc, pbDst );
}
VOID
SYMCRYPT_CALL
SymCryptAesCbcEncryptAsm(
_In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
_Inout_updates_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbChainingValue,
_In_reads_( cbData ) PCBYTE pbSrc,
_Out_writes_( cbData ) PBYTE pbDst,
SIZE_T cbData )
{
SymCryptCbcEncrypt( &SymCryptAesBlockCipherNoOpt, pExpandedKey, pbChainingValue, pbSrc, pbDst, cbData );
}
VOID
SYMCRYPT_CALL
SymCryptAesCbcDecryptAsm(
_In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
_Inout_updates_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbChainingValue,
_In_reads_( cbData ) PCBYTE pbSrc,
_Out_writes_( cbData ) PBYTE pbDst,
SIZE_T cbData )
{
SymCryptCbcDecrypt( &SymCryptAesBlockCipherNoOpt, pExpandedKey, pbChainingValue, pbSrc, pbDst, cbData );
}
VOID
SYMCRYPT_CALL
SymCryptAesCtrMsb64Asm(
_In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
_Inout_updates_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbChainingValue,
_In_reads_( cbData ) PCBYTE pbSrc,
_Out_writes_( cbData ) PBYTE pbDst,
SIZE_T cbData )
{
SYMCRYPT_ASSERT( SymCryptAesBlockCipherNoOpt.blockSize == SYMCRYPT_AES_BLOCK_SIZE ); // keep Prefast happy
SymCryptCtrMsb64( &SymCryptAesBlockCipherNoOpt, pExpandedKey, pbChainingValue, pbSrc, pbDst, cbData );
}
VOID
SYMCRYPT_CALL
SymCryptWipeAsm( _Out_writes_bytes_( cbData ) PVOID pbData, SIZE_T cbData )
{
volatile BYTE * p = (volatile BYTE *) pbData;
SIZE_T i;
for( i=0; i<cbData; i++ ){
p[i] = 0;
}
}
VOID
SYMCRYPT_CALL
SymCryptFdefMaskedCopyC(
_In_reads_bytes_( nDigits*SYMCRYPT_FDEF_DIGIT_SIZE ) PCBYTE pbSrc,
_Inout_updates_bytes_( nDigits*SYMCRYPT_FDEF_DIGIT_SIZE ) PBYTE pbDst,
UINT32 nDigits,
UINT32 mask );
VOID
SYMCRYPT_CALL
SymCryptFdefMaskedCopyAsm(
_In_reads_bytes_( nDigits*SYMCRYPT_FDEF_DIGIT_SIZE ) PCBYTE pbSrc,
_Inout_updates_bytes_( nDigits*SYMCRYPT_FDEF_DIGIT_SIZE ) PBYTE pbDst,
UINT32 nDigits,
UINT32 mask )
{
SymCryptFdefMaskedCopyC( pbSrc, pbDst, nDigits, mask );
}
UINT32
SYMCRYPT_CALL
SymCryptFdefRawAddC(
_In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc1,
_In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc2,
_Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 pDst,
UINT32 nDigits );
UINT32
SYMCRYPT_CALL
SymCryptFdefRawAddAsm(
_In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc1,
_In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc2,
_Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 pDst,
UINT32 nDigits )
{
return SymCryptFdefRawAddC( pSrc1, pSrc2, pDst, nDigits );
}
UINT32
SYMCRYPT_CALL
SymCryptFdefRawSubC(
_In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc1,
_In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc2,
_Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 pDst,
UINT32 nDigits );
UINT32
SYMCRYPT_CALL
SymCryptFdefRawSubAsm(
_In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc1,
_In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc2,
_Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 pDst,
UINT32 nDigits )
{
return SymCryptFdefRawSubC( pSrc1, pSrc2, pDst, nDigits );
}
VOID
SYMCRYPT_CALL
SymCryptFdefRawMulC(
_In_reads_(nWords1) PCUINT32 pSrc1,
UINT32 nDigits1,
_In_reads_(nWords2) PCUINT32 pSrc2,
UINT32 nDigits2,
_Out_writes_(nWords1 + nWords2) PUINT32 pDst );
VOID
SYMCRYPT_CALL
SymCryptFdefRawMulMulx(
_In_reads_(nDgigits1*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc1,
UINT32 nDigits1,
_In_reads_(nDigits2*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc2,
UINT32 nDigits2,
_Out_writes_(nWords1 + nWords2) PUINT32 pDst )
{
SymCryptFdefRawMulC( pSrc1, nDigits1, pSrc2, nDigits2, pDst );
}
VOID
SYMCRYPT_CALL
SymCryptFdefRawMulAsm(
_In_reads_(nDigits1*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc1,
UINT32 nDigits1,
_In_reads_(nDigits2*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc2,
UINT32 nDigits2,
_Out_writes_(nWords1 + nWords2) PUINT32 pDst )
{
SymCryptFdefRawMulC( pSrc1, nDigits1, pSrc2, nDigits2, pDst );
}
VOID
SYMCRYPT_CALL
SymCryptFdefRawSquareC(
_In_reads_(nWords) PCUINT32 pSrc,
UINT32 nDigits,
_Out_writes_(2*nWords) PUINT32 pDst );
VOID
SYMCRYPT_CALL
SymCryptFdefRawSquareMulx(
_In_reads_(nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc,
UINT32 nDigits,
_Out_writes_(2*nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PUINT32 pDst )
{
SymCryptFdefRawSquareC( pSrc, nDigits, pDst );
}
VOID
SYMCRYPT_CALL
SymCryptFdefRawSquareAsm(
_In_reads_(nDgigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc,
UINT32 nDigits,
_Out_writes_(2*nWords) PUINT32 pDst )
{
SymCryptFdefRawSquareC( pSrc, nDigits, pDst );
}
VOID
SymCryptFdefMontgomeryReduceC(
_In_ PCSYMCRYPT_MODULUS pmMod,
_In_ PUINT32 pSrc,
_Out_ PUINT32 pDst );
VOID
SYMCRYPT_CALL
SymCryptFdefMontgomeryReduceMulx(
_In_ PCSYMCRYPT_MODULUS pmMod,
_In_ PUINT32 pSrc,
_Out_ PUINT32 pDst )
{
SymCryptFdefMontgomeryReduceC( pmMod, pSrc, pDst );
}
VOID
SYMCRYPT_CALL
SymCryptFdefMontgomeryReduceAsm(
_In_ PCSYMCRYPT_MODULUS pmMod,
_In_ PUINT32 pSrc,
_Out_ PUINT32 pDst )
{
SymCryptFdefMontgomeryReduceC( pmMod, pSrc, pDst );
}

lib/makefile.inc

@ -0,0 +1,12 @@
.SUFFIXES: .symcryptasm .cppasm
# We still have architecture-specific inference rules because otherwise we cannot do any architecture-specific preprocessing
# Preprocess amd64 .symcryptasm into masm
{amd64\}.symcryptasm{$(OBJ_PATH)\$(O)\..\amd64\}.asm:
..\scripts\symcryptasm_processor.py masm $< $(OBJ_PATH)\$(O)\$(<B).cppasm
$(CC) $(CFLAGS) /EP /P /I..\inc\ /I.\ /DSYMCRYPT_CPU_AMD64 /DSYMCRYPT_MASM /Fi$@ $(OBJ_PATH)\$(O)\$(<B).cppasm
# Preprocess x86 .cppasm into masm
{i386\}.cppasm{$(OBJ_PATH)\$(O)\..\i386\}.asm:
$(CC) $(CFLAGS) /EP /P /I..\inc\ /I.\ /DSYMCRYPT_CPU_X86 /DSYMCRYPT_MASM /Fi$@ $<


@ -2212,11 +2212,11 @@ SymCryptFdefModElementToIntGeneric(
SYMCRYPT_ERROR
SYMCRYPT_CALL
SymCryptFdefRawSetValue(
_In_reads_bytes_(cbSrc) PCBYTE pbSrc,
SIZE_T cbSrc,
SYMCRYPT_NUMBER_FORMAT format,
_Out_writes_(nWords) PUINT32 pDst,
UINT32 nWords );
_In_reads_bytes_(cbSrc) PCBYTE pbSrc,
SIZE_T cbSrc,
SYMCRYPT_NUMBER_FORMAT format,
_Out_writes_(nDigits * SYMCRYPT_FDEF_DIGIT_NUINT32) PUINT32 pDst,
UINT32 nDigits );
SYMCRYPT_ERROR
SYMCRYPT_CALL
@ -2250,11 +2250,11 @@ SymCryptFdefModElementSetValueNegUint32(
SYMCRYPT_ERROR
SYMCRYPT_CALL
SymCryptFdefRawGetValue(
_In_reads_(nWords) PCUINT32 pSrc,
UINT32 nWords,
_Out_writes_bytes_(cbBytes) PBYTE pbDst,
SIZE_T cbDst,
SYMCRYPT_NUMBER_FORMAT format );
_In_reads_(nDigits * SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc,
UINT32 nDigits,
_Out_writes_bytes_(cbBytes) PBYTE pbDst,
SIZE_T cbDst,
SYMCRYPT_NUMBER_FORMAT format );
SYMCRYPT_ERROR
SYMCRYPT_CALL
@ -2492,14 +2492,6 @@ SymCryptFdefRawSubUint32(
_Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 pDst,
UINT32 nDigits );
UINT32
SYMCRYPT_CALL
SymCryptFdefRawMaskedAddUint32(
_Inout_updates_( nWords ) PUINT32 pAcc,
_In_reads_( nWords ) PCUINT32 pSrc,
UINT32 mask,
UINT32 nWords );
VOID
SYMCRYPT_CALL
SymCryptFdefModMulGeneric(
@ -2530,16 +2522,6 @@ SymCryptFdefModMulMontgomery256Asm(
_Out_writes_bytes_( cbScratch ) PBYTE pbScratch,
SIZE_T cbScratch );
VOID
SYMCRYPT_CALL
SymCryptFdefModMulMontgomery256Test(
_In_ PCSYMCRYPT_MODULUS pMod,
_In_ PCSYMCRYPT_MODELEMENT pSrc1,
_In_ PCSYMCRYPT_MODELEMENT pSrc2,
_Out_ PSYMCRYPT_MODELEMENT pDst,
_Out_writes_bytes_( cbScratch ) PBYTE pbScratch,
SIZE_T cbScratch );
VOID
SYMCRYPT_CALL
SymCryptFdef369ModMulMontgomery(
@ -2684,11 +2666,11 @@ SymCryptFdefRawMul(
VOID
SYMCRYPT_CALL
SymCryptFdefRawMulMulx(
_In_reads_(nDigits1*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc1,
UINT32 nDigits1,
_In_reads_(nDigits2*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc2,
UINT32 nDigits2,
_Out_writes_(nWords1 + nWords2) PUINT32 pDst );
_In_reads_(nDigits1*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc1,
UINT32 nDigits1,
_In_reads_(nDigits2*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc2,
UINT32 nDigits2,
_Out_writes_((nDigits1+nDigits2)*SYMCRYPT_FDEF_DIGIT_NUINT32) PUINT32 pDst );
VOID
SYMCRYPT_CALL
@ -2696,7 +2678,7 @@ SymCryptFdefRawMulMulx1024(
_In_reads_(nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc1,
_In_reads_(nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc2,
UINT32 nDigits,
_Out_writes_(nWords1 + nWords2) PUINT32 pDst );
_Out_writes_(2*nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PUINT32 pDst );
VOID
SYMCRYPT_CALL
@ -2732,7 +2714,7 @@ UINT32
SYMCRYPT_CALL
SymCryptFdefRawIsEqualUint32(
_In_ PCUINT32 pSrc1,
UINT32 nWords,
UINT32 nDigits,
_In_ UINT32 u32Src2 );
UINT32
@ -2909,27 +2891,27 @@ SymCryptFdef369MaskedCopyAsm(
VOID
SYMCRYPT_CALL
SymCryptFdefRawMulAsm(
_In_reads_(nDgigits1*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc1,
UINT32 nDigits1,
_In_reads_(nDigits2*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc2,
UINT32 nDigits2,
_Out_writes_(nWords1 + nWords2) PUINT32 pDst );
_In_reads_(nDigits1*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc1,
UINT32 nDigits1,
_In_reads_(nDigits2*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc2,
UINT32 nDigits2,
_Out_writes_((nDigits1+nDigits2)*SYMCRYPT_FDEF_DIGIT_NUINT32) PUINT32 pDst );
VOID
SYMCRYPT_CALL
SymCryptFdefRawSquareAsm(
_In_reads_(nDgigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc,
_In_reads_(nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc,
UINT32 nDigits,
_Out_writes_(2*nWords) PUINT32 pDst );
_Out_writes_(2*nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PUINT32 pDst );
VOID
SYMCRYPT_CALL
SymCryptFdef369RawMulAsm(
_In_reads_(nDgigits1*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc1,
UINT32 nDigits1,
_In_reads_(nDigits2*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc2,
UINT32 nDigits2,
_Out_writes_(nWords1 + nWords2) PUINT32 pDst );
_In_reads_(nDigits1*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc1,
UINT32 nDigits1,
_In_reads_(nDigits2*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc2,
UINT32 nDigits2,
_Out_writes_((nDigits1+nDigits2)*SYMCRYPT_FDEF_DIGIT_NUINT32) PUINT32 pDst );
VOID
SYMCRYPT_CALL
@ -2937,14 +2919,14 @@ SymCryptFdefRawMul512Asm(
_In_reads_(nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc1,
_In_reads_(nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc2,
UINT32 nDigits,
_Out_writes_(2*nWords) PUINT32 pDst );
_Out_writes_(2*nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PUINT32 pDst );
VOID
SYMCRYPT_CALL
SymCryptFdefRawSquare512Asm(
_In_reads_(nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc,
UINT32 nDigits,
_Out_writes_(2*nWords) PUINT32 pDst );
_Out_writes_(2*nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PUINT32 pDst );
VOID
SYMCRYPT_CALL
@ -2952,69 +2934,69 @@ SymCryptFdefRawMul1024Asm(
_In_reads_(nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc1,
_In_reads_(nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc2,
UINT32 nDigits,
_Out_writes_(2*nWords) PUINT32 pDst );
_Out_writes_(2*nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PUINT32 pDst );
VOID
SYMCRYPT_CALL
SymCryptFdefRawSquare1024Asm(
_In_reads_(nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc,
UINT32 nDigits,
_Out_writes_(2*nWords) PUINT32 pDst );
_Out_writes_(2*nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PUINT32 pDst );
VOID
SYMCRYPT_CALL
SymCryptFdefMontgomeryReduceAsm(
_In_ PCSYMCRYPT_MODULUS pmMod,
_In_ PUINT32 pSrc,
_Inout_ PUINT32 pSrc,
_Out_ PUINT32 pDst );
VOID
SYMCRYPT_CALL
SymCryptFdefMontgomeryReduce256Asm(
_In_ PCSYMCRYPT_MODULUS pmMod,
_In_ PUINT32 pSrc,
_Inout_ PUINT32 pSrc,
_Out_ PUINT32 pDst );
VOID
SYMCRYPT_CALL
SymCryptFdefMontgomeryReduce512Asm(
_In_ PCSYMCRYPT_MODULUS pmMod,
_In_ PUINT32 pSrc,
_Inout_ PUINT32 pSrc,
_Out_ PUINT32 pDst );
VOID
SYMCRYPT_CALL
SymCryptFdefMontgomeryReduce1024Asm(
_In_ PCSYMCRYPT_MODULUS pmMod,
_In_ PUINT32 pSrc,
_Inout_ PUINT32 pSrc,
_Out_ PUINT32 pDst );
VOID
SYMCRYPT_CALL
SymCryptFdef369MontgomeryReduce(
_In_ PCSYMCRYPT_MODULUS pmMod,
_In_ PUINT32 pSrc,
_Inout_ PUINT32 pSrc,
_Out_ PUINT32 pDst );
VOID
SYMCRYPT_CALL
SymCryptFdef369MontgomeryReduceAsm(
_In_ PCSYMCRYPT_MODULUS pmMod,
_In_ PUINT32 pSrc,
_Inout_ PUINT32 pSrc,
_Out_ PUINT32 pDst );
VOID
SYMCRYPT_CALL
SymCryptFdefMontgomeryReduceMulx(
_In_ PCSYMCRYPT_MODULUS pmMod,
_In_ PUINT32 pSrc,
_Inout_ PUINT32 pSrc,
_Out_ PUINT32 pDst );
VOID
SYMCRYPT_CALL
SymCryptFdefMontgomeryReduceMulx1024(
_In_ PCSYMCRYPT_MODULUS pmMod,
_In_ PUINT32 pSrc,
_Inout_ PUINT32 pSrc,
_Out_ PUINT32 pDst );
// Helper macro for checking for specific key validation flag using bits 4 and 5 in a flags variable


@ -13,14 +13,29 @@ ARM64X_EC_ENABLED=1
TARGETNAME = symcrypt
TARGETTYPE=LIBRARY
KM_LIBRARY = 1 # enable /kernel flag & epilogue metadata
GUARD = 1 # enable CFG
ENABLE_ASM_RETPOLINE = 1
ENABLE_RETPOLINE_LINKER_WARNING = 1
# Enable /Gy for all assembler code
ASM_DEFINES=$(ASM_DEFINES) /Gy
USE_MAKEFILE_INC = 1
# Explicitly call out that we must preprocess symcryptasm files
# Make the target paths be architecture specific to get nmake to pick the right inference rule
NTTARGETFILE0=\
!IF "$(_BUILDARCH)" == "amd64"
$(OBJ_PATH)\$(O)\..\amd64\fdef_asm.asm \
$(OBJ_PATH)\$(O)\..\amd64\wipe.asm \
$(OBJ_PATH)\$(O)\..\amd64\aesasm.asm \
$(OBJ_PATH)\$(O)\..\amd64\fdef369_asm.asm \
$(OBJ_PATH)\$(O)\..\amd64\fdef_mulx.asm \
!ELSEIF "$(_BUILDARCH)" == "x86"
$(OBJ_PATH)\$(O)\..\i386\fdef_asm.asm \
!ENDIF
INCLUDES= \
..\inc; \
$(DS_INC_PATH)\crypto; \
@ -137,7 +152,6 @@ SOURCES= \
scsTools.c \
AMD64_SOURCES = \
# sha1asm.asm \
wipe.asm \
aesasm.asm \
fdef_asm.asm \
@ -145,10 +159,8 @@ AMD64_SOURCES = \
fdef_mulx.asm \
I386_SOURCES = \
# sha1asm.asm \
aesasm.asm \
wipe.asm \
# rc4asm.asm \
fdef_asm.asm \
ARM_SOURCES = \


@ -0,0 +1,36 @@
//
// symcryptasm_shared.cppasm Shared definitions used by the C preprocessor step in symcryptasm
// processing. See scripts/symcryptasm_processor.py for more details.
//
// Copyright (c) Microsoft Corporation. Licensed under the MIT license.
//
#if defined(SYMCRYPT_MASM)
#if defined(SYMCRYPT_CPU_AMD64)
include ksamd64.inc
#endif
#include "C_asm_shared.inc"
#define FILE_END() END
#define ALIGN(__alignment) align __alignment
#define GET_SYMBOL_ADDRESS(__symbol) __symbol
#define HEX(__constant) __constant##h
#elif defined(SYMCRYPT_GAS)
.intel_syntax noprefix
#include "C_asm_shared.inc"
#define FILE_END()
#define ALIGN(__alignment) .align __alignment
#define GET_SYMBOL_ADDRESS(__symbol) __symbol+rip
#define HEX(__constant) 0x##__constant
#else
#error Unknown target assembly
#endif
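//
// Illustrative example (SymCryptSomeTable is a hypothetical symbol): after full
// preprocessing, a symcryptasm line such as
//     lea Q0, [GET_SYMBOL_ADDRESS(SymCryptSomeTable)]
// assembles as "lea rax, [SymCryptSomeTable]" under MASM and as
// "lea rax, [SymCryptSomeTable+rip]" under GAS; similarly HEX(1F) yields 1Fh or 0x1F.
//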


@ -0,0 +1,657 @@
#!/usr/bin/env python3
"""
This script enables processing of symcryptasm files so that they can be assembled in a variety of
environments without requiring forking or duplication of source files - symcryptasm files phrase
assembly in an assembler and environment agnostic way.
The current target assemblers are:
MASM and GAS
The current target environments are:
amd64 Windows (using the Microsoft x64 calling convention), and
amd64 Linux (using the SystemV amd64 calling convention)
Currently we only support functions with up to 6 arguments, and only amd64, but the plan is to
rephrase all remaining .asm in SymCrypt as symcryptasm, extending support as appropriate to enable
this effort.
The processing of symcryptasm files takes place in 2 passes. The first pass is performed by this
symcryptasm_processor.py script, which does the more stateful processing, outputting a .cppasm file.
The .cppasm files are further processed by the C preprocessor to do simpler, stateless text
substitutions, outputting a .asm file which can be assembled by the target assembler for the target
environment.
We have set up the intermediate generated files to be created in the output directories in both
razzle and CMake builds.
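For example, an illustrative two-step invocation (paths hypothetical) would be:
    symcryptasm_processor.py masm amd64/fdef_asm.symcryptasm fdef_asm.cppasm
followed by running the C preprocessor over fdef_asm.cppasm to produce fdef_asm.asm.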
### symcryptasm syntax ###
Different calling conventions pass arguments to functions in different registers, have differing
numbers of volatile and non-volatile registers, and use the stack in different ways.
We define our own register naming scheme which abstracts away the differences between calling
conventions. The generalities of the naming scheme will be similar across target architectures, but
refer to the Architecture specifics below for details. For the following general information we use
the notation R<n> to denote registers in the symcryptasm register naming scheme.
A leaf function (a function which does not call another function) begins with an invocation of the
FUNCTION_START macro which currently takes 3 arguments:
1) The function name
This must be the name that matches the corresponding declaration of the function
2) The number of arguments (arg_count) that the function takes
These arguments will be accessible in some contiguous region of the symcrypt registers at the
start of the function
On amd64 this contiguous region is R1..R<arg_count>
Note: arg_count need not correspond to the exact number of arguments in the function declaration
if the assembly does not use some tail of the arguments
3) The number of registers (reg_count) that the function uses
These registers will be accessible as R0..R<reg_count-1>
A leaf function ends with the FUNCTION_END macro, which also takes the function name
(a FUNCTION_END macro's function name must match the preceding FUNCTION_START's name)
At the function start a prologue is generated which arranges the arguments appropriately in
registers, and saves non-volatile registers that have been requested to be used.
At the function end an epilogue is generated which restores the non-volatile registers and returns.
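As an illustrative sketch (SymCryptExampleAdd is a hypothetical function, not one in SymCrypt),
a leaf function taking 2 arguments and using 3 registers could be written:
    FUNCTION_START(SymCryptExampleAdd, 2, 3)
        mov Q0, [Q1] // Q1 holds the first argument (a pointer)
        add Q0, [Q2] // Q2 holds the second argument; the sum is returned in Q0
    FUNCTION_END(SymCryptExampleAdd)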
A nested function (a function which does call another function) is specified similarly, only using
NESTED_FUNCTION_START and NESTED_FUNCTION_END macros. A nested function currently updates and aligns
the stack pointer in the function prologue, and avoids use of the redzone in the SystemV ABI.
A macro begins with an invocation of the MACRO_START macro, which takes the macro name and a
variable number of macro argument names. It ends with MACRO_END.
### Architecture specifics ###
### amd64 ###
We allow up to 15 registers to be addressed, with the names:
Q0-Q15 (64-bit registers), D0-D15 (32-bit registers), W0-W15 (16-bit registers), and B0-B15 (8-bit
registers)
Xmm0-Xmm5 registers may be used directly in assembly too; in both amd64 calling conventions we
currently support, these registers are volatile and so do not need any special handling
On function entry we insert a prologue which ensures:
Q0 is the result register (the return value of the function, and the low half of a multiplication)
Q1-Q6 are the first 6 arguments passed to the function
Additionally, there is a special case for functions using mul or mulx instructions, as these
instructions make rdx a special register. Functions using these instructions may address Q0-Q14,
and QH. As rdx is used to pass arguments, its value is moved to another register in the function
prologue. The MUL_FUNCTION_START and MUL_FUNCTION_END macros are used in this case.
We currently do not support nested mul functions, as we have none of them.
"""
import re
import types
import logging
class Register:
"""A class to represent registers"""
def __init__(self, name64, name32, name16, name8):
self.name64 = name64
self.name32 = name32
self.name16 = name16
self.name8 = name8
# amd64 registers
REG_RAX = Register("rax", "eax", "ax", "al")
REG_RBX = Register("rbx", "ebx", "bx", "bl")
REG_RCX = Register("rcx", "ecx", "cx", "cl")
REG_RDX = Register("rdx", "edx", "dx", "dl")
REG_RSI = Register("rsi", "esi", "si", "sil")
REG_RDI = Register("rdi", "edi", "di", "dil")
REG_RSP = Register("rsp", "esp", "sp", "spl")
REG_RBP = Register("rbp", "ebp", "bp", "bpl")
REG_R8 = Register( "r8", "r8d", "r8w", "r8b")
REG_R9 = Register( "r9", "r9d", "r9w", "r9b")
REG_R10 = Register("r10", "r10d", "r10w", "r10b")
REG_R11 = Register("r11", "r11d", "r11w", "r11b")
REG_R12 = Register("r12", "r12d", "r12w", "r12b")
REG_R13 = Register("r13", "r13d", "r13w", "r13b")
REG_R14 = Register("r14", "r14d", "r14w", "r14b")
REG_R15 = Register("r15", "r15d", "r15w", "r15b")
class CallingConvention:
"""A class to represent calling conventions"""
def __init__(self, name, architecture, mapping, argument_registers, volatile_registers, gen_prologue_fn, gen_epilogue_fn, gen_get_memslot_offset_fn):
self.name = name
self.architecture = architecture
self.mapping = mapping
self.argument_registers = argument_registers
self.volatile_registers = volatile_registers
self.gen_prologue_fn = types.MethodType(gen_prologue_fn, self)
self.gen_epilogue_fn = types.MethodType(gen_epilogue_fn, self)
self.gen_get_memslot_offset_fn = types.MethodType(gen_get_memslot_offset_fn, self)
def get_mul_mapping_from_normal_mapping(mapping, argument_registers):
"""Gets the register mapping used in functions requiring special rdx handling.
In amd64, when using mul and mulx, rdx is a special register.
rdx is also used for passing arguments in both Msft and System V calling conventions.
In asm functions that use mul or mulx, we will explicitly move the argument passed in
rdx to a different volatile register in the function prologue, and in the function body
we refer to rdx using (Q|D|W|B)H.
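For example, in the Microsoft x64 mapping below, rdx is Q2; the mul mapping therefore makes
QH refer to rdx, maps Q2 to the first non-argument volatile register (r10), and shifts all
later registers down by one index.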
"""
rdx_index = None
return_mapping = { 'H': REG_RDX }
for (index, register) in mapping.items():
if register == REG_RDX:
rdx_index = index
break
for (index, register) in mapping.items():
# preserve argument registers
if (index <= argument_registers) and (index != rdx_index):
return_mapping[index] = register
# replace rdx with the first non-argument register
if index == argument_registers+1:
return_mapping[rdx_index] = register
# shuffle all later registers down to fill the gap
if index > argument_registers+1:
return_mapping[index-1] = register
return return_mapping
# Calling convention constants
MAX_FUNCTION_ARGUMENT_COUNT = 6 # restrict to 6 arguments for now
MAX_FUNCTION_REGISTER_COUNT = 15
# Microsoft x64 calling convention
MAPPING_AMD64_MSFT = {
0: REG_RAX, # Result register
1: REG_RCX, # Argument 1 / volatile
2: REG_RDX, # Argument 2 / volatile
3: REG_R8, # Argument 3 / volatile
4: REG_R9, # Argument 4 / volatile
5: REG_R10, # volatile
6: REG_R11, # volatile
7: REG_RSI, # All registers from rsi are non-volatile and need to be saved/restored in epi/prologue
8: REG_RDI,
9: REG_RBP,
10:REG_RBX,
11:REG_R12,
12:REG_R13,
13:REG_R14,
14:REG_R15,
# currently not mapping rsp
}
def calc_amd64_shadow_space_allocation_size(self, reg_count):
# If we are a nested function, we must allocate 32B of shadow space on the stack, and ensure the
# stack pointer is aligned to 16B
# Before the prologue we have rsp % 16 == 8 - as the call pushed an 8B return address on an
# aligned stack
alignment = 8
# We then pushed some number of additional 8B registers onto the stack
if reg_count > self.volatile_registers:
alignment = (alignment + (8 * (reg_count - self.volatile_registers))) % 16
shadow_space_allocation_size = 32
if alignment == 8:
# possibly allocate 8 more bytes to align the stack to 16B
shadow_space_allocation_size += 8
return shadow_space_allocation_size
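# Worked example (illustrative): under the msft_x64 convention (7 volatile registers), a nested
# function with reg_count == 9 pushed 2 registers, so rsp % 16 == 8 at this point; 32B of shadow
# space plus 8B of padding (40B total) restores 16B stack alignment.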
def gen_prologue_amd64_msft(self, arg_count, reg_count, mul_fixup="", nested=False):
prologue = "\n"
if reg_count > self.volatile_registers:
prologue += "rex_push_reg Q%s\n" % self.volatile_registers
for i in range(self.volatile_registers+1, reg_count):
prologue += "push_reg Q%s\n" % i
prologue += "\nEND_PROLOGUE\n\n"
shadow_space_allocation_size = 0
if nested:
shadow_space_allocation_size = calc_amd64_shadow_space_allocation_size(self, reg_count)
prologue += "sub rsp, %d // allocate shadow space and align stack\n\n" % shadow_space_allocation_size
prologue += mul_fixup
# put additional arguments into Q5-Q6 (we do not support more than 6 (MAX_FUNCTION_ARGUMENT_COUNT) arguments for now)
# stack_offset to get the 5th argument is:
# 32B of shadow space + 8B for return address + (8*#pushed registers in prologue) + shadow_space_allocation_size
stack_offset = 32 + 8 + (8*(reg_count-self.volatile_registers)) + shadow_space_allocation_size
for i in range(self.argument_registers+1, min(arg_count+1, MAX_FUNCTION_ARGUMENT_COUNT+1)):
prologue += "mov Q%s, [rsp + %d]\n" % (i, stack_offset)
stack_offset += 8
return prologue
def gen_prologue_amd64_msft_mul(self, arg_count, reg_count):
return gen_prologue_amd64_msft(self, arg_count, reg_count, "mov Q2, QH\n")
def gen_prologue_amd64_msft_nested(self, arg_count, reg_count):
return gen_prologue_amd64_msft(self, arg_count, reg_count, "", nested=True)
def gen_epilogue_amd64_msft(self, arg_count, reg_count, nested=False):
epilogue = ""
if nested:
shadow_space_allocation_size = calc_amd64_shadow_space_allocation_size(self, reg_count)
epilogue += "add rsp, %d // deallocate shadow space and align stack\n\n" % shadow_space_allocation_size
if reg_count > self.volatile_registers:
epilogue += "BEGIN_EPILOGUE\n"
for i in reversed(range(self.volatile_registers, reg_count)):
epilogue += "pop Q%s\n" % i
epilogue += "ret\n"
return epilogue
def gen_epilogue_amd64_msft_nested(self, arg_count, reg_count):
return gen_epilogue_amd64_msft(self, arg_count, reg_count, nested=True)
def gen_get_memslot_offset_amd64_msft(self, slot, arg_count, reg_count, nested=False):
# only support 4 memory slots for now (in shadow space)
if(slot >= 4):
logging.error("Symcryptasm currently only support 4 memory slots! (requested slot%d)" % slot)
exit(1)
# 8B for return address + (8*#pushed registers in prologue)
stack_offset = 8 + (8*(reg_count-self.volatile_registers))
if nested:
stack_offset += calc_amd64_shadow_space_allocation_size(self, reg_count)
return "%d /*MEMSLOT%d*/" % (stack_offset+(8*slot), slot)
def gen_get_memslot_offset_amd64_msft_nested(self, slot, arg_count, reg_count):
return gen_get_memslot_offset_amd64_msft(self, slot, arg_count, reg_count, nested=True)
CALLING_CONVENTION_AMD64_MSFT = CallingConvention(
"msft_x64", "amd64", MAPPING_AMD64_MSFT, 4, 7,
gen_prologue_amd64_msft, gen_epilogue_amd64_msft, gen_get_memslot_offset_amd64_msft)
CALLING_CONVENTION_AMD64_MSFT_MUL = CallingConvention(
"msft_x64", "amd64", get_mul_mapping_from_normal_mapping(MAPPING_AMD64_MSFT, 4), 4, 6,
gen_prologue_amd64_msft_mul, gen_epilogue_amd64_msft, gen_get_memslot_offset_amd64_msft)
CALLING_CONVENTION_AMD64_MSFT_NESTED = CallingConvention(
"msft_x64", "amd64", MAPPING_AMD64_MSFT, 4, 7,
gen_prologue_amd64_msft_nested, gen_epilogue_amd64_msft_nested, gen_get_memslot_offset_amd64_msft_nested)
# AMD64 System V calling convention
MAPPING_AMD64_SYSTEMV = {
0: REG_RAX, # Result register
1: REG_RDI, # Argument 1 / volatile
2: REG_RSI, # Argument 2 / volatile
3: REG_RDX, # Argument 3 / volatile
4: REG_RCX, # Argument 4 / volatile
5: REG_R8, # Argument 5 / volatile
6: REG_R9, # Argument 6 / volatile
7: REG_R10, # volatile
8: REG_R11, # volatile
9: REG_RBX, # All registers from rbx are non-volatile and need to be saved/restored in epi/prologue
10:REG_RBP,
11:REG_R12,
12:REG_R13,
13:REG_R14,
14:REG_R15
# currently not mapping rsp
}
def gen_prologue_amd64_systemv(self, arg_count, reg_count, mul_fixup="", nested=False):
# push volatile registers onto the stack
prologue = "\n"
if reg_count > self.volatile_registers:
for i in range(self.volatile_registers, reg_count):
prologue += "push Q%s\n" % i
# If we are a nested function, we need to align the stack to 16B, and allocate space for up to 4
# memory slots not in the redzone. We can use the same logic as on the MSFT x64 side to allocate
# our own space for 32B of local variables (whereas on the MSFT side, we use this for allocating
# space for a function we are about to call)
if nested:
allocation_size = calc_amd64_shadow_space_allocation_size(self, reg_count)
prologue += "sub rsp, %d // allocate memslot space and align stack\n\n" % allocation_size
prologue += mul_fixup
# do not support more than 6 (MAX_FUNCTION_ARGUMENT_COUNT) arguments for now
# # put additional arguments into Q7-Qn
# # stack_offset to get the 7th argument is:
# # 8B for return address
# stack_offset = 8
# for i in range(self.argument_registers+1, arg_count+1):
# prologue += "mov Q%s, [rsp + %d]\n" % (i, stack_offset)
# stack_offset += 8
return prologue
def gen_prologue_amd64_systemv_mul(self, arg_count, reg_count):
return gen_prologue_amd64_systemv(self, arg_count, reg_count, "mov Q3, QH\n")
def gen_prologue_amd64_systemv_nested(self, arg_count, reg_count):
return gen_prologue_amd64_systemv(self, arg_count, reg_count, "", nested=True)
def gen_epilogue_amd64_systemv(self, arg_count, reg_count, nested=False):
epilogue = ""
if nested:
allocation_size = calc_amd64_shadow_space_allocation_size(self, reg_count)
epilogue += "add rsp, %d // deallocate memslot space and align stack\n\n" % allocation_size
if reg_count > self.volatile_registers:
for i in reversed(range(self.volatile_registers, reg_count)):
epilogue += "pop Q%s\n" % i
epilogue += "ret\n"
return epilogue
def gen_epilogue_amd64_systemv_nested(self, arg_count, reg_count):
return gen_epilogue_amd64_systemv(self, arg_count, reg_count, nested=True)
def gen_get_memslot_offset_amd64_systemv(self, slot, arg_count, reg_count, nested=False):
# only support 4 memory slots for now
if(slot >= 4):
logging.error("Symcryptasm currently only support 4 memory slots! (requested slot%d)" % slot)
exit(1)
# For leaf functions, use the top of the redzone below the stack pointer
offset = -8 * (slot+1)
if nested:
# For nested functions, use the 32B of memslot space above the stack pointer created in the prologue
offset = 8*slot
return "%d /*MEMSLOT%d*/" % (offset, slot)
def gen_get_memslot_offset_amd64_systemv_nested(self, slot, arg_count, reg_count):
return gen_get_memslot_offset_amd64_systemv(self, slot, arg_count, reg_count, nested=True)
CALLING_CONVENTION_AMD64_SYSTEMV = CallingConvention(
"amd64_systemv", "amd64", MAPPING_AMD64_SYSTEMV, 6, 9,
gen_prologue_amd64_systemv, gen_epilogue_amd64_systemv, gen_get_memslot_offset_amd64_systemv)
CALLING_CONVENTION_AMD64_SYSTEMV_MUL = CallingConvention(
"amd64_systemv", "amd64", get_mul_mapping_from_normal_mapping(MAPPING_AMD64_SYSTEMV, 6), 6, 8,
gen_prologue_amd64_systemv_mul, gen_epilogue_amd64_systemv, gen_get_memslot_offset_amd64_systemv)
CALLING_CONVENTION_AMD64_SYSTEMV_NESTED = CallingConvention(
"amd64_systemv", "amd64", MAPPING_AMD64_SYSTEMV, 6, 9,
gen_prologue_amd64_systemv_nested, gen_epilogue_amd64_systemv_nested, gen_get_memslot_offset_amd64_systemv_nested)
def gen_function_start_defines(mapping, arg_count, reg_count):
defines = ""
for (index, reg) in mapping.items():
if (index != 'H') and (index >= max(arg_count+1, reg_count)):
continue
defines += "#define Q%s %s\n" % (index, reg.name64)
defines += "#define D%s %s\n" % (index, reg.name32)
defines += "#define W%s %s\n" % (index, reg.name16)
defines += "#define B%s %s\n" % (index, reg.name8)
return defines
def gen_function_end_defines(mapping, arg_count, reg_count):
undefs = ""
for (index, _) in mapping.items():
if (index != 'H') and (index >= max(arg_count+1, reg_count)):
continue
undefs += "#undef Q%s\n" % (index)
undefs += "#undef D%s\n" % (index)
undefs += "#undef W%s\n" % (index)
undefs += "#undef B%s\n" % (index)
return undefs
MASM_FRAMELESS_FUNCTION_ENTRY = "LEAF_ENTRY %s, _TEXT\n"
MASM_FRAMELESS_FUNCTION_END = "LEAF_END %s, _TEXT\n"
MASM_FRAME_FUNCTION_ENTRY = "NESTED_ENTRY %s, _TEXT\n"
MASM_FRAME_FUNCTION_END = "NESTED_END %s, _TEXT\n"
GAS_FUNCTION_ENTRY = "%s: .global %s\n"
GAS_FUNCTION_END = ""
def generate_prologue(assembler, calling_convention, function_name, arg_count, reg_count, nested):
function_entry = None
if assembler == "masm":
# need to identify and mark up frame functions in masm
if nested or (reg_count > calling_convention.volatile_registers):
function_entry = MASM_FRAME_FUNCTION_ENTRY % (function_name)
else:
function_entry = MASM_FRAMELESS_FUNCTION_ENTRY % (function_name)
elif assembler == "gas":
function_entry = GAS_FUNCTION_ENTRY % (function_name, function_name)
prologue = gen_function_start_defines(calling_convention.mapping, arg_count, reg_count)
prologue += "%s" % (function_entry)
prologue += calling_convention.gen_prologue_fn(arg_count, reg_count)
return prologue
def generate_epilogue(assembler, calling_convention, function_name, arg_count, reg_count, nested):
function_end = None
if assembler == "masm":
# need to identify and mark up frame functions in masm
if nested or (reg_count > calling_convention.volatile_registers):
function_end = MASM_FRAME_FUNCTION_END % (function_name)
else:
function_end = MASM_FRAMELESS_FUNCTION_END % (function_name)
elif assembler == "gas":
function_end = GAS_FUNCTION_END
epilogue = calling_convention.gen_epilogue_fn(arg_count, reg_count)
epilogue += "%s" % (function_end)
epilogue += gen_function_end_defines(calling_convention.mapping, arg_count, reg_count)
return epilogue
MASM_MACRO_START = "%s MACRO %s\n"
MASM_MACRO_END = "ENDM\n"
GAS_MACRO_START = ".macro %s %s\n"
GAS_MACRO_END = ".endm\n"
MASM_ALTERNATE_ENTRY= "ALTERNATE_ENTRY %s\n"
GAS_ALTERNATE_ENTRY = "%s: .global %s\n"
FUNCTION_START_PATTERN = re.compile(r"\s*(NESTED_)?(MUL_)?FUNCTION_START\s*\(\s*([a-zA-Z0-9]+)\s*,\s*([0-9]+)\s*,\s*([0-9]+)\s*\)")
FUNCTION_END_PATTERN = re.compile(r"\s*(NESTED_)?(MUL_)?FUNCTION_END\s*\(\s*([a-zA-Z0-9]+)\s*\)")
GET_MEMSLOT_PATTERN = re.compile(r"GET_MEMSLOT_OFFSET\s*\(\s*slot([0-9]+)\s*\)")
ALTERNATE_ENTRY_PATTERN = re.compile(r"\s*ALTERNATE_ENTRY\s*\(\s*([a-zA-Z0-9]+)\s*\)")
MACRO_START_PATTERN = re.compile(r"\s*MACRO_START\s*\(\s*([A-Z_0-9]+)\s*,([^\)]+)\)")
MACRO_END_PATTERN = re.compile(r"\s*MACRO_END\s*\(\s*\)")
class ProcessingStateMachine:
"""A class to hold the state when processing a file and handle files line by line"""
def __init__(self, assembler, normal_calling_convention, mul_calling_convention, nested_calling_convention):
self.assembler = assembler
self.normal_calling_convention = normal_calling_convention
self.mul_calling_convention = mul_calling_convention
self.nested_calling_convention = nested_calling_convention
self.function_start_match = None
self.function_start_line = 0
self.is_nested_function = None
self.is_mul_function = None
self.calling_convention = None
self.function_name = None
self.arg_count = None
self.reg_count = None
self.macro_start_match = None
self.macro_name = None
self.macro_args = None
def process_line(self, line, line_num):
if self.function_start_match == None and self.macro_start_match == None:
return self.process_normal_line(line, line_num)
elif self.function_start_match != None:
return self.process_function_line(line, line_num)
elif self.macro_start_match != None:
return self.process_macro_line(line, line_num)
else:
logging.error("Whoops, something is broken with the state machine (failed at line %d)" % line_num)
exit(1)
def process_normal_line(self, line, line_num):
# Not currently in a function or macro
match = FUNCTION_START_PATTERN.match(line)
if (match):
return self.process_start_function(match, line, line_num)
match = MACRO_START_PATTERN.match(line)
if (match):
return self.process_start_macro(match, line, line_num)
# Not starting a function or a macro
return line
def process_start_function(self, match, line, line_num):
# Entering a new function
self.function_start_match = match
self.function_start_line = line_num
self.is_nested_function = (match.group(1) == "NESTED_")
self.is_mul_function = (match.group(2) == "MUL_")
self.function_name = match.groups()[-3]
self.arg_count = int(match.groups()[-2])
self.reg_count = int(match.groups()[-1])
if self.is_nested_function and self.is_mul_function:
logging.error(
"Too many prefixes for symcryptasm function - currently only 1 of prefix, MUL_ or NESTED_, is supported!\n\t"
"%s (line %d)"
% (line, line_num))
exit(1)
if self.arg_count > MAX_FUNCTION_ARGUMENT_COUNT:
logging.error(
"Too many (%d) arguments for symcryptasm function - currently only %d arguments are supported!\n\t"
"%s (line %d)"
% (self.arg_count, MAX_FUNCTION_ARGUMENT_COUNT, match.group(0), line_num))
exit(1)
if self.reg_count > MAX_FUNCTION_REGISTER_COUNT:
logging.error(
"Too many (%d) registers required for symcryptasm function - only %d registers are supported!\n\t"
"%s (line %d)"
% (self.reg_count, MAX_FUNCTION_REGISTER_COUNT, match.group(0), line_num))
exit(1)
if self.is_mul_function and self.reg_count > MAX_FUNCTION_REGISTER_COUNT-1:
logging.error(
"Too many (%d) registers required for symcryptasm mul function - only %d registers are supported!\n\t"
"%s (line %d)"
% (self.reg_count, MAX_FUNCTION_REGISTER_COUNT-1, match.group(0), line_num))
exit(1)
logging.info("%d: function start %s, %d, %d" % (line_num, self.function_name, self.arg_count, self.reg_count))
if self.is_nested_function:
self.calling_convention = self.nested_calling_convention
elif self.is_mul_function:
self.calling_convention = self.mul_calling_convention
else:
self.calling_convention = self.normal_calling_convention
return generate_prologue(self.assembler, self.calling_convention, self.function_name, self.arg_count, self.reg_count, self.is_nested_function)
def process_start_macro(self, match, line, line_num):
self.macro_start_match = match
self.macro_name = match.group(1)
self.macro_args = [ x.strip() for x in match.group(2).split(",") ]
logging.info("%d: macro start %s, %s" % (line_num, self.macro_name, self.macro_args))
if self.assembler == "masm":
return MASM_MACRO_START % (self.macro_name, match.group(2))
elif self.assembler == "gas":
return GAS_MACRO_START % (self.macro_name, match.group(2))
def process_function_line(self, line, line_num):
# Currently in a function
match = ALTERNATE_ENTRY_PATTERN.match(line)
if (match):
if self.assembler == "masm":
return MASM_ALTERNATE_ENTRY % match.group(1)
elif self.assembler == "gas":
return GAS_ALTERNATE_ENTRY % (match.group(1), match.group(1))
match = FUNCTION_END_PATTERN.match(line)
if (match):
# Check the end function has same prefix as previous start function
if (self.is_nested_function ^ (match.group(1) == "NESTED_")) or \
(self.is_mul_function ^ (match.group(2) == "MUL_")):
logging.error("Function start and end do not have same MUL_ or NESTED_ prefix!\n\tStart: %s (line %d)\n\tEnd: %s (line %d)"
% (self.function_start_match.group(0), self.function_start_line, match.group(0), line_num))
exit(1)
# Check the end function pattern has the same label as the previous start function pattern
if self.function_name != match.groups()[-1]:
logging.error("Function start label does not match Function end label!\n\tStart: %s (line %d)\n\tEnd: %s (line %d)"
% (self.function_name, self.function_start_line, match.groups()[-1], line_num))
exit(1)
epilogue = generate_epilogue(self.assembler, self.calling_convention, self.function_name, self.arg_count, self.reg_count, self.is_nested_function)
logging.info("%d: function end %s" % (line_num, self.function_name))
self.function_start_match = None
self.function_start_line = 0
self.is_nested_function = None
self.is_mul_function = None
self.calling_convention = None
self.function_name = None
self.arg_count = None
self.reg_count = None
return epilogue
# replace any GET_MEMSLOT_OFFSET macros in line
match = GET_MEMSLOT_PATTERN.search(line)
while(match):
slot = int(match.group(1))
replacement = self.calling_convention.gen_get_memslot_offset_fn(slot, self.arg_count, self.reg_count)
line = GET_MEMSLOT_PATTERN.sub(replacement, line)
match = GET_MEMSLOT_PATTERN.search(line)
logging.info("%d: memslot macro %d" % (line_num, slot))
# Not modifying the line any further
return line
def process_macro_line(self, line, line_num):
# Currently in a macro
match = MACRO_END_PATTERN.match(line)
if (match):
logging.info("%d: macro end %s" % (line_num, self.macro_name))
self.macro_start_match = None
self.macro_name = None
self.macro_args = None
if self.assembler == "masm":
return MASM_MACRO_END
elif self.assembler == "gas":
return GAS_MACRO_END
if self.assembler == "gas":
# In GAS macros we need to escape all of the macro arguments with a backslash in the macro body
for arg in self.macro_args:
line = re.sub(arg, r"\\%s" % arg, line)
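# e.g. (illustrative): for a macro argument named count, a body line "mov Q0, count"
# is rewritten to "mov Q0, \count" so that GAS substitutes the argument value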
# Not modifying the line any further
return line
def process_file(target, infilename, outfilename):
assembler = None
if target == "masm":
assembler = "masm"
normal_calling_convention = CALLING_CONVENTION_AMD64_MSFT
mul_calling_convention = CALLING_CONVENTION_AMD64_MSFT_MUL
nested_calling_convention = CALLING_CONVENTION_AMD64_MSFT_NESTED
elif target == "gas":
assembler = "gas"
normal_calling_convention = CALLING_CONVENTION_AMD64_SYSTEMV
mul_calling_convention = CALLING_CONVENTION_AMD64_SYSTEMV_MUL
nested_calling_convention = CALLING_CONVENTION_AMD64_SYSTEMV_NESTED
else:
logging.error("Unknown target assembler (%s)" % target)
exit(1)
# iterate through file line by line in one pass
file_processing_state = ProcessingStateMachine(
assembler, normal_calling_convention, mul_calling_convention, nested_calling_convention)
with open(infilename) as infile:
with open(outfilename, "w") as outfile:
for line_num, line in enumerate(infile, start=1):
processed_line = file_processing_state.process_line(line, line_num)
outfile.write(processed_line)
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser(description="Preprocess symcryptasm into files that will be further processed with C preprocessor to generate MASM or GAS")
parser.add_argument('target', type=str, help='Target that we want to preprocess for')
parser.add_argument('inputfile', type=str, help='Path to input file')
parser.add_argument('outputfile', type=str, help='Path to output file')
args = parser.parse_args()
process_file(args.target, args.inputfile, args.outputfile)


@ -7,7 +7,6 @@
#include "precomp.h"
#define EQU =
#include "C_asm_shared.inc"
VOID