Merged PR 5854070: Introduce symcryptasm format to enable use of asm in Windows and Linux

+ Introduce a 2-stage pre-processing setup to convert .symcryptasm to either masm (msft x64
calling convention) or gas (SystemV amd64 calling convention) - see the illustrative sketch below
  + Step 1 converts .symcryptasm to .cppasm (using `scripts/symcryptasm_processor.py`)
  + Step 2 converts .cppasm to .asm using the C preprocessor
+ Updated CMakeLists.txt to invoke this preprocessing when any relevant file is updated
+ Also introduced makefile.inc for the razzle build
+ I have translated all of the amd64 asm files we want to preserve, and the performance for
big-integer-reliant code is the same on Windows and Linux (and a bit better on Windows than before :))
+ In translation I did some tidying of the underlying assembly:
  + Removing needless work (some size-specific functions in particular had cruft from their
adaptation from the generic-sized versions)
  + Reducing code size (e.g. by using inc/dec rather than add/sub 1)
  + Some micro-optimizations to remove needless instruction dependencies
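
As a sketch of the format (using a hypothetical function for illustration, not one from this change;
the argument-register mappings below are the standard ones for each ABI), a .symcryptasm function such as

    FUNCTION_START(SymCryptExampleAdd, 2, 3)
        mov Q0, [Q1]
        add Q0, [Q2]
    FUNCTION_END(SymCryptExampleAdd)

declares 2 arguments and 3 symcryptasm registers in use; the processor maps Q1/Q2 to rcx/rdx when
emitting masm (msft x64) and to rdi/rsi when emitting gas (SystemV), while Q0 is always rax, so a
return value in Q0 needs no per-convention handling.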

Related work items: #30621935
Samuel Lee 2021-04-23 11:33:23 +00:00
Parent 27765f9929
Commit 77d1e446e4
36 changed files with 6038 additions and 8131 deletions

View File

@ -28,9 +28,8 @@ set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib/${CMAKE_SYSTEM_PROCES
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/exe/${CMAKE_SYSTEM_PROCESSOR}/${SYMCRYPT_TARGET_ENV})
if(WIN32 AND SYMCRYPT_TARGET_ENV MATCHES "WindowsUserMode")
# Set DBG=1 and enable ASM_MASM. Annoyingly, this has to be done in the main CMake file rather than in
# the toolchain file
add_compile_options(-DDBG=1)
# Enable ASM_MASM. Annoyingly, this has to be done in the main CMake file rather than in the
# toolchain file
enable_language(ASM_MASM)
add_compile_options(/MP)
# Remove /RTC1, incompatible with /Ox
@ -43,16 +42,23 @@ if(WIN32 AND SYMCRYPT_TARGET_ENV MATCHES "WindowsUserMode")
string( REPLACE "/Od" "" CMAKE_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG})
string( REPLACE "/Od" "" CMAKE_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE})
string( REPLACE "/Od" "" CMAKE_C_FLAGS_RELEASE ${CMAKE_C_FLAGS_RELEASE})
IF(CMAKE_BUILD_TYPE MATCHES Release)
message("Release mode")
if(CMAKE_BUILD_TYPE MATCHES Release)
add_compile_options(/Oxs)
ENDIF(CMAKE_BUILD_TYPE MATCHES Release)
endif()
elseif(NOT WIN32)
enable_language(ASM)
add_compile_options(-Wno-deprecated-declarations -Wno-deprecated)
add_compile_options(-g)
add_compile_options(-Wno-multichar)
add_compile_options(-fPIC)
endif()
if(CMAKE_BUILD_TYPE MATCHES Release)
message("Release mode")
else()
message("Debug mode")
add_compile_options(-DDBG=1)
endif()
include_directories(inc)

View File

@ -1,4 +1,4 @@
# Introduction
SymCrypt is the core cryptographic function library currently used by Windows.
## History
@ -30,20 +30,23 @@ or gcc 7.4.0 on Linux. Note that CMake ships with Visual Studio 2019.
4. Configure CMake compilation:
* For 32-bit Windows targets: `cmake .. -DCMAKE_TOOLCHAIN_FILE=../cmake-toolchain/windows-x86.cmake -A Win32`
* For 64-bit Windows targets: `cmake .. -DCMAKE_TOOLCHAIN_FILE=../cmake-toolchain/windows-amd64.cmake`
* For Linux (or Windows with no CPU optimizations): `cmake ..`
* For 64-bit Linux targets: `cmake .. -DCMAKE_TOOLCHAIN_FILE=../cmake-toolchain/linux-amd64.cmake`
* For no CPU optimizations: `cmake ..`
* Optionally, for a release build, specify `-DCMAKE_BUILD_TYPE=Release`
5. `cmake --build .`
* Optionally specify -jN where N is the number of processes you wish to spawn for the build
If compilation succeeds, the output will be put in the `exe` subdirectory relative to where compilation occurred
(i.e. `bin/exe` if you followed the instructions above).
The SymCrypt unit test is in the `unittest` directory. It runs extensive functional tests on the SymCrypt
library. On Windows it also compares results against other implementations such as the Windows APIs CNG
and CAPI, and the older crypto libraries rsa32 and msbignum, if they are available. It also provides
detailed performance information.
# Security Bugs
If you believe you have found a problem that affects the security of this code, please do **NOT** create an issue
or pull request, but instead email your comments to secure@microsoft.com.
# Contribute
We love to receive comments and suggestions. Unfortunately we cannot accept external code contributions at this time.

View File

@ -10,7 +10,6 @@ set(SYMCRYPT_TARGET_ENV Linux)
# Define _AMD64_ to set up the correct SymCrypt macros, e.g. SYMCRYPT_CPU_AMD64
add_compile_options(-D_AMD64_)
add_compile_options(-DDBG)
add_compile_options(-O3)
# Enable a baseline of features for the compiler to support everywhere

View File

@ -1,70 +1,43 @@
;/*
; C_asm_shared.inc file to synchronize C and Asm information
; Copyright (c) Microsoft Corporation. Licensed under the MIT license.
/*
C_asm_shared.inc file to synchronize C and Asm information
Copyright (c) Microsoft Corporation. Licensed under the MIT license.
; This is a file that compiles both in C and ASM to define values in a way that is guaranteed to be the same on both sides.
; We use this to define the structure offsets that the ASM code uses.
; By having equivalent C constants we can add checks to the C code to ensure they are correct.
;
; This is an ugly hack, but it works :-)
;
; Due to the fact that the ARM assemblers use the C precompiler
; the C files have to redefine EQU to nothing before including this file.
; */
This is a file that is included in both C and ASM such that the values are the same on both sides.
We use the C preprocessor to set ASM constants, as we already need to use the C preprocessor for
symcryptasm processing (see scripts/symcryptasm_processor.py).
We use this to define the structure offsets that the ASM code uses.
By having equivalent C constants we can add checks to the C code to ensure they are correct.
*/
;const SIZE_T
SymCryptModulusNdigitsOffsetAmd64 EQU 4;
#if defined(SYMCRYPT_MASM)
#define SET(_variable, _value) _variable EQU _value
#elif defined(SYMCRYPT_GAS)
#define SET(_variable, _value) .set _variable, _value
#else // assume C
#define SET(_variable, _value) const SIZE_T _variable = _value;
#endif
; const SIZE_T
SymCryptModulusMontgomeryInv64OffsetAmd64 EQU 32;
SET(SymCryptModulusNdigitsOffsetAmd64, 4);
SET(SymCryptModulusMontgomeryInv64OffsetAmd64, 32);
SET(SymCryptModulusValueOffsetAmd64, 128);
; const SIZE_T
SymCryptModulusValueOffsetAmd64 EQU 128;
SET(SymCryptModulusNdigitsOffsetX86, 4);
SET(SymCryptModulusMontgomeryInv64OffsetX86, 24);
SET(SymCryptModulusValueOffsetX86, 96);
SET(SymCryptModulusNdigitsOffsetArm64, 4);
SET(SymCryptModulusMontgomeryInv64OffsetArm64, 32);
SET(SymCryptModulusValueOffsetArm64, 128);
SET(SymCryptModulusNdigitsOffsetArm, 4);
SET(SymCryptModulusMontgomeryInv64OffsetArm, 24);
SET(SymCryptModulusValueOffsetArm, 96);
;const SIZE_T
SymCryptModulusNdigitsOffsetX86 EQU 4;
; const SIZE_T
SymCryptModulusMontgomeryInv64OffsetX86 EQU 24;
; const SIZE_T
SymCryptModulusValueOffsetX86 EQU 96;
;const SIZE_T
SymCryptModulusNdigitsOffsetArm64 EQU 4;
; const SIZE_T
SymCryptModulusMontgomeryInv64OffsetArm64 EQU 32;
; const SIZE_T
SymCryptModulusValueOffsetArm64 EQU 128;
;const SIZE_T
SymCryptModulusNdigitsOffsetArm EQU 4;
; const SIZE_T
SymCryptModulusMontgomeryInv64OffsetArm EQU 24;
; const SIZE_T
SymCryptModulusValueOffsetArm EQU 96;
; /*
IF 0
; */
#undef EQU
#if !defined(SYMCRYPT_MASM) && !defined(SYMCRYPT_GAS)
// Preserve the definition of SET for use in symcryptasm processing
#undef SET
#endif
#if SYMCRYPT_CPU_AMD64
#define SYMCRYPT_CHECK_ASM_OFFSETS \
@ -89,14 +62,9 @@ SymCryptModulusValueOffsetArm EQU 96;
SYMCRYPT_CHECK_ASM_OFFSET( SymCryptModulusNdigitsOffsetArm, SYMCRYPT_FIELD_OFFSET( SYMCRYPT_MODULUS, nDigits ) );\
SYMCRYPT_CHECK_ASM_OFFSET( SymCryptModulusMontgomeryInv64OffsetArm, SYMCRYPT_FIELD_OFFSET( SYMCRYPT_MODULUS, tm.montgomery.inv64 ));\
SYMCRYPT_CHECK_ASM_OFFSET( SymCryptModulusValueOffsetArm, SYMCRYPT_FIELD_OFFSET( SYMCRYPT_MODULUS, Divisor.Int.ti.fdef.uint32 ));\
#endif // CPU_*
#if !defined( SYMCRYPT_CHECK_ASM_OFFSETS)
#define SYMCRYPT_CHECK_ASM_OFFSETS
#endif
; /*
ENDIF
; */
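
To make the shared-constant mechanism above concrete: for the first constant, the SET macro expands to

    SymCryptModulusNdigitsOffsetAmd64 EQU 4                 (masm, SYMCRYPT_MASM defined)
    .set SymCryptModulusNdigitsOffsetAmd64, 4               (gas, SYMCRYPT_GAS defined)
    const SIZE_T SymCryptModulusNdigitsOffsetAmd64 = 4;     (C, neither defined)

so a single source line keeps both assemblers and the C-side SYMCRYPT_CHECK_ASM_OFFSET checks in agreement.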

View File

@ -96,42 +96,148 @@ set(SOURCES_COMMON
IEEE802_11SaeCustom.c
)
function(process_cppasm filepath outformat archdefine)
get_filename_component(fileextension ${filepath} EXT)
if(NOT fileextension STREQUAL .cppasm)
message(FATAL_ERROR "cppasm processing invoked on file with incorrect extension (${filepath} -> ${fileextension})")
endif()
if((NOT outformat STREQUAL gas) AND (NOT outformat STREQUAL masm))
message(FATAL_ERROR "cppasm processing invoked with unrecognized outformat (${outformat})")
endif()
if((NOT archdefine STREQUAL amd64) AND (NOT archdefine STREQUAL x86))
message(FATAL_ERROR "cppasm processing invoked with unrecognized archdefine (${archdefine})")
endif()
get_filename_component(rootpath ${filepath} DIRECTORY)
get_filename_component(filestem ${filepath} NAME_WE) # filestem is the filename w/out extension
string(TOUPPER ${outformat} outformatupper)
string(TOUPPER ${archdefine} archdefineupper)
string(FIND ${rootpath} ${CMAKE_CURRENT_BINARY_DIR} findindex) # check whether input is in the output directory
if(findindex EQUAL -1) # input in the source directory
set(filepath ${CMAKE_CURRENT_SOURCE_DIR}/${filepath})
set(output_pass2 ${CMAKE_CURRENT_BINARY_DIR}/${rootpath}/${filestem}-${outformat}.asm)
else() # input in the output directory
set(output_directory ${rootpath})
set(output_pass2 ${rootpath}/${filestem}.asm)
endif()
set(dbg_definition "")
if(CMAKE_BUILD_TYPE MATCHES Debug)
set(dbg_definition "-DDBG=1")
endif()
if(outformat STREQUAL gas)
# assume gas => GCC compatible C compiler
add_custom_command(
OUTPUT ${output_pass2}
COMMAND "${CMAKE_C_COMPILER}" -E -P -x c ${filepath} -o ${output_pass2}
-I${CMAKE_CURRENT_SOURCE_DIR} -I${CMAKE_CURRENT_SOURCE_DIR}/${rootpath} -I${CMAKE_SOURCE_DIR}/inc
-DSYMCRYPT_${outformatupper} -DSYMCRYPT_CPU_${archdefineupper} ${dbg_definition}
MAIN_DEPENDENCY ${filepath}
DEPENDS ${CMAKE_SOURCE_DIR}/inc/C_asm_shared.inc ${filepath} symcryptasm_shared.cppasm
COMMENT "C preprocessing ${filepath} to ${outformat} (${output_pass2})"
VERBATIM)
elseif(outformat STREQUAL masm)
# assume masm => MSVC C compiler
add_custom_command(
OUTPUT ${output_pass2}
COMMAND "${CMAKE_C_COMPILER}" /EP /P /Fi${output_pass2} ${filepath}
-I${CMAKE_CURRENT_SOURCE_DIR} -I${CMAKE_CURRENT_SOURCE_DIR}/${rootpath} -I${CMAKE_SOURCE_DIR}/inc
-DSYMCRYPT_${outformatupper} -DSYMCRYPT_CPU_${archdefineupper} ${dbg_definition}
MAIN_DEPENDENCY ${filepath}
DEPENDS ${CMAKE_SOURCE_DIR}/inc/C_asm_shared.inc ${filepath} symcryptasm_shared.cppasm
COMMENT "C preprocessing ${filepath} to ${outformat} (${output_pass2})"
VERBATIM)
endif()
endfunction()
function(process_symcryptasm filepath outformat archdefine)
get_filename_component(fileextension ${filepath} EXT)
if(NOT fileextension STREQUAL .symcryptasm)
message(FATAL_ERROR "symcryptasm processing invoked on file with incorrect extension (${filepath} -> ${fileextension})")
endif()
if((NOT outformat STREQUAL gas) AND (NOT outformat STREQUAL masm))
message(FATAL_ERROR "symcryptasm processing invoked with unrecognized outformat (${outformat})")
endif()
get_filename_component(rootpath ${filepath} DIRECTORY)
get_filename_component(filestem ${filepath} NAME_WE) # filestem is the filename w/out extension
set(filepath ${CMAKE_CURRENT_SOURCE_DIR}/${filepath})
set(output_directory ${CMAKE_CURRENT_BINARY_DIR}/${rootpath})
set(output_cppasm ${output_directory}/${filestem}-${outformat}.cppasm)
add_custom_command(
OUTPUT ${output_cppasm}
COMMAND ${CMAKE_COMMAND} -E make_directory ${output_directory}
COMMAND python3 ${CMAKE_SOURCE_DIR}/scripts/symcryptasm_processor.py ${outformat} ${filepath} ${output_cppasm}
MAIN_DEPENDENCY ${filepath}
DEPENDS ${CMAKE_SOURCE_DIR}/scripts/symcryptasm_processor.py
COMMENT "Python preprocessing ${filepath} to ${outformat} (${output_cppasm})"
VERBATIM)
process_cppasm(${output_cppasm} ${outformat} ${archdefine})
endfunction()
if(NOT WIN32)
list(APPEND SOURCES_COMMON linux/intrinsics.c)
list(APPEND SOURCES_COMMON linux/asmstubs.c)
endif()
if(WIN32 AND NOT(SYMCRYPT_TARGET_ENV MATCHES "Generic"))
if(CMAKE_SYSTEM_PROCESSOR MATCHES "AMD64")
process_symcryptasm(amd64/aesasm.symcryptasm masm amd64)
process_symcryptasm(amd64/fdef_asm.symcryptasm masm amd64)
process_symcryptasm(amd64/fdef369_asm.symcryptasm masm amd64)
process_symcryptasm(amd64/fdef_mulx.symcryptasm masm amd64)
process_symcryptasm(amd64/wipe.symcryptasm masm amd64)
list(APPEND SOURCES_COMMON
amd64/aesasm.asm
amd64/fdef_asm.asm
amd64/fdef_mulx.asm
amd64/fdef369_asm.asm
amd64/sha1asm.asm
amd64/wipe.asm)
amd64/aesasm-masm.asm
amd64/fdef_asm-masm.asm
amd64/fdef369_asm-masm.asm
amd64/fdef_mulx-masm.asm
amd64/wipe-masm.asm)
set_source_files_properties(
amd64/aesasm.asm
amd64/fdef_asm.asm
amd64/fdef_mulx.asm
amd64/fdef369_asm.asm
amd64/sha1asm.asm
amd64/wipe.asm
amd64/aesasm-masm.asm
amd64/fdef_asm-masm.asm
amd64/fdef369_asm-masm.asm
amd64/fdef_mulx-masm.asm
amd64/wipe-masm.asm
PROPERTY LANGUAGE ASM_MASM)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "X86")
process_cppasm(i386/fdef_asm.cppasm masm x86)
list(APPEND SOURCES_COMMON
i386/aesasm.asm
i386/fdef_asm.asm
i386/rc4asm.asm
i386/sha1asm.asm
i386/fdef_asm-masm.asm
i386/wipe.asm)
set_source_files_properties(
i386/aesasm.asm
i386/fdef_asm.asm
i386/rc4asm.asm
i386/sha1asm.asm
i386/fdef_asm-masm.asm
i386/wipe.asm
PROPERTY LANGUAGE ASM_MASM)
set_source_files_properties(
i386/fdef_asm-masm.asm PROPERTIES INCLUDE_DIRECTORIES ${CMAKE_CURRENT_SOURCE_DIR}/i386)
endif()
else()
if(CMAKE_SYSTEM_PROCESSOR MATCHES "AMD64")
process_symcryptasm(amd64/aesasm.symcryptasm gas amd64)
process_symcryptasm(amd64/fdef_asm.symcryptasm gas amd64)
process_symcryptasm(amd64/fdef369_asm.symcryptasm gas amd64)
process_symcryptasm(amd64/fdef_mulx.symcryptasm gas amd64)
process_symcryptasm(amd64/wipe.symcryptasm gas amd64)
list(APPEND SOURCES_COMMON
amd64/aesasm-gas.asm
amd64/fdef_asm-gas.asm
amd64/fdef369_asm-gas.asm
amd64/fdef_mulx-gas.asm
amd64/wipe-gas.asm)
set_source_files_properties(
amd64/aesasm-gas.asm
amd64/fdef_asm-gas.asm
amd64/fdef369_asm-gas.asm
amd64/fdef_mulx-gas.asm
amd64/wipe-gas.asm
PROPERTY LANGUAGE ASM)
endif()
endif()
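
For example, for a gas target the two stages chained by process_symcryptasm above amount to commands
along these lines (paths abbreviated here; the real include directories and the Debug-only -DDBG=1 are
as computed in process_cppasm):

    python3 scripts/symcryptasm_processor.py gas amd64/fdef_asm.symcryptasm fdef_asm-gas.cppasm
    cc -E -P -x c fdef_asm-gas.cppasm -o fdef_asm-gas.asm -DSYMCRYPT_GAS -DSYMCRYPT_CPU_AMD64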

View File

@ -22,7 +22,7 @@ const SYMCRYPT_MODULAR_FUNCTIONS g_SymCryptModFns[] = {
SYMCRYPT_MOD_FUNCTIONS_FDEF_GENERIC, // Handles any type of modulus
SYMCRYPT_MOD_FUNCTIONS_FDEF_MONTGOMERY, // Montgomery, only for odd parity-public moduli
#if SYMCRYPT_CPU_AMD64 && SYMCRYPT_MS_VC
#if SYMCRYPT_CPU_AMD64
SYMCRYPT_MOD_FUNCTIONS_FDEF369_MONTGOMERY, // optimized for 384 and 576-bit moduli
SYMCRYPT_MOD_FUNCTIONS_FDEF_MONTGOMERY256, // Special faster code for 256-bit Montgomery moduli
@ -55,12 +55,12 @@ const UINT32 g_SymCryptModFnsMask = sizeof( g_SymCryptModFns ) - sizeof( g_SymCr
//
// Tweaking the selection & function tables allows different tradeoffs of performance vs codesize
//
SYMCRYPT_MODULUS_TYPE_SELECTION_ENTRY SymCryptModulusTypeSelections[] =
{
#if SYMCRYPT_CPU_AMD64 && SYMCRYPT_MS_VC
#if SYMCRYPT_CPU_AMD64
// Mulx used for 257-512 and 577-... bits
{('2M' << 16) + SymCryptModFntableMontgomery256, 0, 256, SYMCRYPT_MODULUS_FEATURE_MONTGOMERY },
{('xM' << 16) + SymCryptModFntableMontgomeryMulx, SYMCRYPT_CPU_FEATURES_FOR_MULX, 512, SYMCRYPT_MODULUS_FEATURE_MONTGOMERY },
{('9M' << 16) + SymCryptModFntable369Montgomery, 0, 384, SYMCRYPT_MODULUS_FEATURE_MONTGOMERY },
{('5M' << 16) + SymCryptModFntableMontgomery512, 0, 512, SYMCRYPT_MODULUS_FEATURE_MONTGOMERY },
{('9M' << 16) + SymCryptModFntable369Montgomery, 0, 576, SYMCRYPT_MODULUS_FEATURE_MONTGOMERY },
@ -118,9 +118,9 @@ SymCryptSizeofIntFromDigits( UINT32 nDigits )
PSYMCRYPT_INT
SYMCRYPT_CALL
SymCryptIntCreate(
_Out_writes_bytes_( cbBuffer ) PBYTE pbBuffer,
SIZE_T cbBuffer,
UINT32 nDigits )
{
return SymCryptFdefIntCreate( pbBuffer, cbBuffer, nDigits );
@ -138,8 +138,8 @@ SymCryptIntWipe( _Out_ PSYMCRYPT_INT piDst )
VOID
SYMCRYPT_CALL
SymCryptIntCopy(
_In_ PCSYMCRYPT_INT piSrc,
_Out_ PSYMCRYPT_INT piDst )
{
SymCryptFdefIntCopy( piSrc, piDst );
@ -191,8 +191,8 @@ SymCryptIntDigitsizeOfObject( _In_ PCSYMCRYPT_INT piSrc )
SYMCRYPT_ERROR
SYMCRYPT_CALL
SymCryptIntCopyMixedSize(
_In_ PCSYMCRYPT_INT piSrc,
_Out_ PSYMCRYPT_INT piDst )
{
return SymCryptFdefIntCopyMixedSize( piSrc, piDst );
@ -207,8 +207,8 @@ SymCryptIntBitsizeOfValue( _In_ PCSYMCRYPT_INT piSrc )
VOID
SYMCRYPT_CALL
SymCryptIntSetValueUint32(
UINT32 u32Src,
_Out_ PSYMCRYPT_INT piDst )
{
SymCryptFdefIntSetValueUint32( u32Src, piDst );
@ -216,8 +216,8 @@ SymCryptIntSetValueUint32(
VOID
SYMCRYPT_CALL
SymCryptIntSetValueUint64(
UINT64 u64Src,
_Out_ PSYMCRYPT_INT piDst )
{
SymCryptFdefIntSetValueUint64( u64Src, piDst );
@ -225,10 +225,10 @@ SymCryptIntSetValueUint64(
SYMCRYPT_ERROR
SYMCRYPT_CALL
SymCryptIntSetValue(
_In_reads_bytes_(cbSrc) PCBYTE pbSrc,
SIZE_T cbSrc,
SYMCRYPT_NUMBER_FORMAT format,
_Out_ PSYMCRYPT_INT piDst )
{
return SymCryptFdefIntSetValue( pbSrc, cbSrc, format, piDst );
@ -236,10 +236,10 @@ SymCryptIntSetValue(
SYMCRYPT_ERROR
SYMCRYPT_CALL
SymCryptIntGetValue(
_In_ PCSYMCRYPT_INT piSrc,
_Out_writes_bytes_( cbDst ) PBYTE pbDst,
SIZE_T cbDst,
SYMCRYPT_NUMBER_FORMAT format )
{
return SymCryptFdefIntGetValue( piSrc, pbDst, cbDst, format );
@ -496,9 +496,9 @@ SymCryptSizeofDivisorFromDigits( UINT32 nDigits )
PSYMCRYPT_DIVISOR
SYMCRYPT_CALL
SymCryptDivisorCreate(
_Out_writes_bytes_( cbBuffer ) PBYTE pbBuffer,
SIZE_T cbBuffer,
UINT32 nDigits )
{
return SymCryptFdefDivisorCreate( pbBuffer, cbBuffer, nDigits );
@ -514,8 +514,8 @@ SymCryptDivisorWipe( _Out_ PSYMCRYPT_DIVISOR pdObj )
}
VOID
SymCryptDivisorCopy(
_In_ PCSYMCRYPT_DIVISOR pdSrc,
_Out_ PSYMCRYPT_DIVISOR pdDst )
{
SymCryptFdefDivisorCopy( pdSrc, pdDst );
@ -585,9 +585,9 @@ SymCryptSizeofModulusFromDigits( UINT32 nDigits )
PSYMCRYPT_MODULUS
SYMCRYPT_CALL
SymCryptModulusCreate(
_Out_writes_bytes_( cbBuffer ) PBYTE pbBuffer,
SIZE_T cbBuffer,
UINT32 nDigits )
{
return SymCryptFdefModulusCreate( pbBuffer, cbBuffer, nDigits );
@ -604,7 +604,7 @@ SymCryptModulusWipe( _Out_ PSYMCRYPT_MODULUS pmObj )
VOID
SymCryptModulusCopy(
_In_ PCSYMCRYPT_MODULUS pmSrc,
_Out_ PSYMCRYPT_MODULUS pmDst )
{
SymCryptFdefModulusCopy( pmSrc, pmDst );
@ -626,8 +626,8 @@ SymCryptModElementAllocate( _In_ PCSYMCRYPT_MODULUS pmMod )
VOID
SYMCRYPT_CALL
SymCryptModElementFree(
_In_ PCSYMCRYPT_MODULUS pmMod,
_Out_ PSYMCRYPT_MODELEMENT peObj )
{
SymCryptFdefModElementFree( pmMod, peObj );
@ -642,9 +642,9 @@ SymCryptSizeofModElementFromModulus( PCSYMCRYPT_MODULUS pmMod )
PSYMCRYPT_MODELEMENT
SYMCRYPT_CALL
SymCryptModElementCreate(
_Out_writes_bytes_( cbBuffer ) PBYTE pbBuffer,
SIZE_T cbBuffer,
PCSYMCRYPT_MODULUS pmMod )
{
return SymCryptFdefModElementCreate( pbBuffer, cbBuffer, pmMod );
@ -660,9 +660,9 @@ SymCryptModElementWipe(
}
VOID
SymCryptModElementCopy(
_In_ PCSYMCRYPT_MODULUS pmMod,
_In_ PCSYMCRYPT_MODELEMENT peSrc,
_Out_ PSYMCRYPT_MODELEMENT peDst )
{
SymCryptFdefModElementCopy( pmMod, peSrc, peDst );
@ -671,7 +671,7 @@ SymCryptModElementCopy(
VOID
SymCryptModElementMaskedCopy(
_In_ PCSYMCRYPT_MODULUS pmMod,
_In_ PCSYMCRYPT_MODELEMENT peSrc,
_Out_ PSYMCRYPT_MODELEMENT peDst,
UINT32 mask )
{
@ -753,7 +753,7 @@ SymCryptModElementToInt(
PCUINT32 pData;
SYMCRYPT_ASSERT( piDst->nDigits >= pmMod->nDigits );
pData = SYMCRYPT_MOD_CALL( pmMod ) modPreGet( pmMod, peSrc, pbScratch, cbScratch );
SymCryptFdefModElementToIntGeneric( pmMod, pData, piDst, pbScratch, cbScratch );
@ -762,17 +762,17 @@ SymCryptModElementToInt(
SYMCRYPT_DISABLE_CFG
SYMCRYPT_ERROR
SYMCRYPT_CALL
SymCryptModElementSetValue(
_In_reads_bytes_( cbSrc ) PCBYTE pbSrc,
SIZE_T cbSrc,
SYMCRYPT_NUMBER_FORMAT format,
PCSYMCRYPT_MODULUS pmMod,
_Out_ PSYMCRYPT_MODELEMENT peDst,
_Out_writes_bytes_( cbScratch ) PBYTE pbScratch,
SIZE_T cbScratch )
{
SYMCRYPT_ERROR scError;
scError = SymCryptFdefModElementSetValueGeneric( pbSrc, cbSrc, format, pmMod, peDst, pbScratch, cbScratch );
if( scError == SYMCRYPT_NO_ERROR )
@ -785,11 +785,11 @@ SymCryptModElementSetValue(
SYMCRYPT_ERROR
SYMCRYPT_CALL
SymCryptModElementGetValue(
PCSYMCRYPT_MODULUS pmMod,
_In_ PCSYMCRYPT_MODELEMENT peSrc,
_Out_writes_bytes_( cbDst ) PBYTE pbDst,
SIZE_T cbDst,
SYMCRYPT_NUMBER_FORMAT format,
_Out_writes_bytes_( cbScratch ) PBYTE pbScratch,
SIZE_T cbScratch )
@ -889,8 +889,8 @@ SymCryptModNeg(
SYMCRYPT_DISABLE_CFG
VOID
SYMCRYPT_CALL
SymCryptModElementSetValueUint32(
UINT32 value,
_In_ PCSYMCRYPT_MODULUS pmMod,
_Out_ PSYMCRYPT_MODELEMENT peDst,
_Out_writes_bytes_( cbScratch ) PBYTE pbScratch,
@ -903,8 +903,8 @@ SymCryptModElementSetValueUint32(
VOID
SYMCRYPT_CALL
SymCryptModElementSetValueNegUint32(
UINT32 value,
_In_ PCSYMCRYPT_MODULUS pmMod,
_Out_ PSYMCRYPT_MODELEMENT peDst,
_Out_writes_bytes_( cbScratch ) PBYTE pbScratch,
@ -994,7 +994,7 @@ SymCryptCreateTrialDivisionContext( UINT32 nDigits )
UINT32
SYMCRYPT_CALL
SymCryptIntFindSmallDivisor(
_In_ PCSYMCRYPT_TRIALDIVISION_CONTEXT pContext,
_In_ PCSYMCRYPT_INT piSrc,
_Out_writes_bytes_( cbScratch ) PBYTE pbScratch,

File diff suppressed because it is too large. Load Diff

View File

@ -0,0 +1,964 @@
//
// aesasm.symcryptasm Assembler code for fast AES on the amd64
// Expresses asm in a generic enough way to enable generation of MASM and GAS using the
// symcryptasm_processor.py script and C preprocessor
//
// Copyright (c) Microsoft Corporation. Licensed under the MIT license.
//
// This code is derived from the AesFast implementation that
// Niels Ferguson wrote from scratch for BitLocker during Vista.
// That code is still in RSA32.
//
// This file has only been partially translated into symcryptasm; external function calls use the
// generic symcryptasm registers to convert different calling conventions into the fixed register
// layout used in aesasm. It seems likely that changing which registers the AES state is kept in
// within the macros could impact performance.
// In general we don't want to touch this code going forward; the vast majority of amd64 CPUs have aesni
// and use the Xmm Aes codepaths.
#include "symcryptasm_shared.cppasm"
#include "symcrypt_version.inc"
#define USE_BLOCK_FUNCTION 1 // Set to 1 to use block function, 0 to use block macro
#if defined(SYMCRYPT_MASM)
extern SymCryptAesSboxMatrixMult:DWORD
extern SymCryptAesInvSboxMatrixMult:DWORD
extern SymCryptAesInvSbox:BYTE
extern SymCryptFatal:NEAR
#elif defined(SYMCRYPT_GAS)
#else
#error Unknown target assembly
#endif
#if DBG
SET(SYMCRYPT_CODE_VERSION, ((SYMCRYPT_CODE_VERSION_API SHL 16) OR SYMCRYPT_CODE_VERSION_MINOR ))
SET(SYMCRYPT_MAGIC_CONSTANT, (HEX(53316D76) + SYMCRYPT_CODE_VERSION)) // 0x53316D76 == 'S1mv'
MACRO_START(SYMCRYPT_CHECK_MAGIC, check_magic_label, ptr, struct_magic_offset, arg_1)
mov rax, [ptr + struct_magic_offset]
sub rax, ptr
cmp rax, SYMCRYPT_MAGIC_CONSTANT
jz check_magic_label
mov arg_1, HEX(6D616763) // 0x6D616763 == 'magc'
call SymCryptFatal
check_magic_label:
MACRO_END()
#else
MACRO_START(SYMCRYPT_CHECK_MAGIC, check_magic_label, ptr, struct_magic_offset, arg_1)
MACRO_END()
#endif
//
// Structure definition that mirrors the SYMCRYPT_AES_EXPANDED_KEY structure.
//
// SYMCRYPT_AES_EXPANDED_KEY struct
// RoundKey dq 2*N_ROUND_KEYS_IN_AESKEY dup (?) //
// lastEncRoundKey dq ? // pointer to last enc round key
// lastDecRoundKey dq ? // pointer to last dec round key
// SYMCRYPT_MAGIC_FIELD
// SYMCRYPT_AES_EXPANDED_KEY ends
SET(N_ROUND_KEYS_IN_AESKEY, 29)
SET(lastEncRoundKeyOffset, (29*16))
SET(lastDecRoundKeyOffset, (29*16 + 8))
SET(magicFieldOffset, (29*16 + 8 + 8))
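// For reference, these evaluate to lastEncRoundKeyOffset = 464, lastDecRoundKeyOffset = 472, and
// magicFieldOffset = 480: 29 round-key slots of 16 bytes each, followed by the two 8-byte pointers.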
//
// Shorthand for the 4 tables we will use
// We always use r11 to point to the (inv) SboxMatrixMult tables
//
#define SMM0 (r11 + 0)
#define SMM1 (r11 + 1024)
#define SMM2 (r11 + 2048)
#define SMM3 (r11 + 3072)
#define ISMM0 (r11 + 0)
#define ISMM1 (r11 + 1024)
#define ISMM2 (r11 + 2048)
#define ISMM3 (r11 + 3072)
MACRO_START(ENC_MIX, keyptr)
//
// Perform the unkeyed mixing function for encryption
// plus a key addition from the key pointer
//
// input: block is in eax, ebx, ecx, edx - r11 points to AesSboxMatrixMult
// New state ends up in eax, ebx, ecx, edx
// Used registers: esi, edi, ebp, r8
//
// We can use the e<xx> registers for the movzx as the
// upper 32 bits are automatically set to 0. This saves
// prefix bytes
//
// We use 32-bit registers to store the state.
// We tried using 64-bit registers, but the extra shifts
// cost too much.
// Using 32-bit throughout makes the key xor more expensive
// but we avoid having to combine the 32-bit halves into
// 64 bit.
//
movzx esi,al
mov esi,[SMM0 + 4 * rsi]
movzx edi,ah
shr eax,16
mov r8d,[SMM1 + 4 * rdi]
movzx ebp,al
mov ebp,[SMM2 + 4 * rbp]
movzx edi,ah
mov edi,[SMM3 + 4 * rdi]
movzx eax,bl
xor edi,[SMM0 + 4 * rax]
movzx eax,bh
shr ebx,16
xor esi,[SMM1 + 4 * rax]
movzx eax,bl
xor r8d,[SMM2 + 4 * rax]
movzx eax,bh
xor ebp,[SMM3 + 4 * rax]
movzx eax,cl
xor ebp,[SMM0 + 4 * rax]
movzx ebx,ch
shr ecx,16
xor edi,[SMM1 + 4 * rbx]
movzx eax,cl
xor esi,[SMM2 + 4 * rax]
movzx ebx,ch
xor r8d,[SMM3 + 4 * rbx]
movzx eax,dl
xor r8d,[SMM0 + 4 * rax]
movzx ebx,dh
shr edx,16
xor ebp,[SMM1 + 4 * rbx]
movzx eax,dl
xor edi,[SMM2 + 4 * rax]
movzx ebx,dh
xor esi,[SMM3 + 4 * rbx]
mov eax, [keyptr]
mov ebx, [keyptr + 4]
xor eax, esi
mov ecx, [keyptr + 8]
xor ebx, edi
mov edx, [keyptr + 12]
xor ecx, ebp
xor edx, r8d
MACRO_END()
MACRO_START(DEC_MIX, keyptr)
//
// Perform the unkeyed mixing function for decryption
//
// input: block is in eax, ebx, ecx, edx
// r11 points to AesInvSboxMatrixMult
// New state ends up in esi, edi, ebp, r8d
movzx esi,al
mov esi,[ISMM0 + 4 * rsi]
movzx edi,ah
shr eax,16
mov edi,[ISMM1 + 4 * rdi]
movzx ebp,al
mov ebp,[ISMM2 + 4 * rbp]
movzx eax,ah
mov r8d,[ISMM3 + 4 * rax]
movzx eax,bl
xor edi,[ISMM0 + 4 * rax]
movzx eax,bh
shr ebx,16
xor ebp,[ISMM1 + 4 * rax]
movzx eax,bl
xor r8d,[ISMM2 + 4 * rax]
movzx eax,bh
xor esi,[ISMM3 + 4 * rax]
movzx eax,cl
xor ebp,[ISMM0 + 4 * rax]
movzx ebx,ch
shr ecx,16
xor r8d,[ISMM1 + 4 * rbx]
movzx eax,cl
xor esi,[ISMM2 + 4 * rax]
movzx ebx,ch
xor edi,[ISMM3 + 4 * rbx]
movzx eax,dl
xor r8d,[ISMM0 + 4 * rax]
movzx ebx,dh
shr edx,16
xor esi,[ISMM1 + 4 * rbx]
movzx eax,dl
xor edi,[ISMM2 + 4 * rax]
movzx ebx,dh
xor ebp,[ISMM3 + 4 * rbx]
mov eax, [keyptr]
mov ebx, [keyptr + 4]
xor eax, esi
mov ecx, [keyptr + 8]
xor ebx, edi
mov edx, [keyptr + 12]
xor ecx, ebp
xor edx, r8d
MACRO_END()
MACRO_START(AES_ENCRYPT_MACRO, AesEncryptMacroLoopLabel)
//
// Plaintext in eax, ebx, ecx, edx
// r9 points to first round key to use (modified)
// r10 is last key to use (unchanged)
// r11 points to SboxMatrixMult (unchanged)
// Ciphertext ends up in esi, edi, ebp, r8d
//
// This macro is free to unroll the cipher completely, or to use a loop
// over r9
//
//
// xor in first round key
//
xor eax,[r9]
xor ebx,[r9+4]
xor ecx,[r9+8]
xor edx,[r9+12]
add r9,32
// Do not unroll the loop at all because very few CPUs use this codepath so it's worth
// minimizing the binary size
AesEncryptMacroLoopLabel:
// Block is eax, ebx, ecx, edx
// r9-16 points to next round key
ENC_MIX r9-16
cmp r9,r10
lea r9,[r9+16]
jc AesEncryptMacroLoopLabel
//
// Now for the final round
// We use the fact that SboxMatrixMult[0] table is also
// an Sbox table if you use the second element of each entry.
//
// Result is in esi, edi, ebp, r8d
//
movzx esi,al
movzx esi,byte ptr[r11 + 1 + 4*rsi]
movzx edi,ah
shr eax,16
movzx r8d,byte ptr[r11 + 1 + 4*rdi]
movzx ebp,al
shl r8d,8
movzx ebp,byte ptr[r11 + 1 + 4*rbp]
shl ebp,16
movzx edi,ah
movzx edi,byte ptr[r11 + 1 + 4*rdi]
shl edi,24
movzx eax,bl
movzx eax,byte ptr[r11 + 1 + 4*rax]
or edi,eax
movzx eax,bh
shr ebx,16
movzx eax,byte ptr[r11 + 1 + 4*rax]
shl eax,8
or esi,eax
movzx eax,bl
movzx eax,byte ptr[r11 + 1 + 4*rax]
movzx ebx,bh
shl eax,16
movzx ebx,byte ptr[r11 + 1 + 4*rbx]
or r8d,eax
shl ebx,24
or ebp,ebx
movzx eax,cl
movzx ebx,ch
movzx eax,byte ptr[r11 + 1 + 4*rax]
shr ecx,16
movzx ebx,byte ptr[r11 + 1 + 4*rbx]
shl ebx,8
or ebp,eax
or edi,ebx
movzx eax,cl
movzx eax,byte ptr[r11 + 1 + 4*rax]
movzx ebx,ch
movzx ebx,byte ptr[r11 + 1 + 4*rbx]
shl eax,16
shl ebx,24
or esi,eax
or r8d,ebx
movzx eax,dl
movzx ebx,dh
movzx eax,byte ptr[r11 + 1 + 4*rax]
shr edx,16
movzx ebx,byte ptr[r11 + 1 + 4*rbx]
shl ebx,8
or r8d,eax
or ebp,ebx
movzx eax,dl
movzx eax,byte ptr[r11 + 1 + 4*rax]
movzx ebx,dh
movzx ebx,byte ptr[r11 + 1 + 4*rbx]
shl eax,16
shl ebx,24
or edi,eax
or esi,ebx
//
// xor in final round key
//
xor r8d,[r10+12]
xor esi,[r10]
xor edi,[r10+4]
xor ebp,[r10+8]
MACRO_END()
MACRO_START(AES_DECRYPT_MACRO, AesDecryptMacroLoopLabel)
//
// Ciphertext in eax, ebx, ecx, edx
// r9 points to first round key to use
// r10 is last key to use (unchanged)
// r11 points to InvSboxMatrixMult (unchanged)
// r12 points to InvSbox (unchanged)
// Plaintext ends up in esi, edi, ebp, r8d
//
//
// xor in first round key
//
xor eax,[r9]
xor ebx,[r9+4]
xor ecx,[r9+8]
xor edx,[r9+12]
add r9,32
// Do not unroll the loop at all because very few CPUs use this codepath so it's worth
// minimizing the binary size
AesDecryptMacroLoopLabel:
// Block is eax, ebx, ecx, edx
// r9-16 points to next round key
DEC_MIX r9-16
cmp r9,r10
lea r9,[r9+16]
jc AesDecryptMacroLoopLabel
//
// Now for the final round
// Result is in esi, edi, ebp, r8d
//
movzx esi,al
movzx esi,byte ptr[r12 + rsi]
movzx edi,ah
shr eax,16
movzx edi,byte ptr[r12 + rdi]
movzx ebp,al
shl edi,8
movzx ebp,byte ptr[r12 + rbp]
shl ebp,16
movzx eax,ah
movzx r8d,byte ptr[r12 + rax]
shl r8d,24
movzx eax,bl
movzx eax,byte ptr[r12 + rax]
or edi,eax
movzx eax,bh
shr ebx,16
movzx eax,byte ptr[r12 + rax]
shl eax,8
or ebp,eax
movzx eax,bl
movzx eax,byte ptr[r12 + rax]
movzx ebx,bh
shl eax,16
movzx ebx,byte ptr[r12 + rbx]
or r8d,eax
shl ebx,24
or esi,ebx
movzx eax,cl
movzx ebx,ch
movzx eax,byte ptr[r12 + rax]
shr ecx,16
movzx ebx,byte ptr[r12 + rbx]
shl ebx,8
or ebp,eax
or r8d,ebx
movzx eax,cl
movzx eax,byte ptr[r12 + rax]
movzx ebx,ch
movzx ebx,byte ptr[r12 + rbx]
shl eax,16
shl ebx,24
or esi,eax
or edi,ebx
movzx eax,dl
movzx ebx,dh
movzx eax,byte ptr[r12 + rax]
shr edx,16
movzx ebx,byte ptr[r12 + rbx]
shl ebx,8
or r8d,eax
or esi,ebx
movzx eax,dl
movzx eax,byte ptr[r12 + rax]
movzx ebx,dh
movzx ebx,byte ptr[r12 + rbx]
shl eax,16
shl ebx,24
or edi,eax
or ebp,ebx
//
// xor in final round key
//
xor esi,[r10]
xor edi,[r10+4]
xor ebp,[r10+8]
xor r8d,[r10+12]
MACRO_END()
#if USE_BLOCK_FUNCTION
//
// We use a block function; the AES_ENCRYPT macro merely calls the function
//
MACRO_START(AES_ENCRYPT, loopLabel)
call SymCryptAesEncryptAsmInternal
MACRO_END()
MACRO_START(AES_DECRYPT, loopLabel)
call SymCryptAesDecryptAsmInternal
MACRO_END()
//========================================
// SymCryptAesEncryptAsmInternal
//
// Internal AES encryption routine with modified calling convention.
// This function has the exact same calling convention as the AES_ENCRYPT_MACRO
FUNCTION_START(SymCryptAesEncryptAsmInternal, 0, 0)
AES_ENCRYPT_MACRO SymCryptAesEncryptAsmInternalLoop
FUNCTION_END(SymCryptAesEncryptAsmInternal)
//========================================
// SymCryptAesDecryptAsmInternal
//
// Internal AES decryption routine with modified calling convention.
// This function has the exact same calling convention as the AES_DECRYPT_MACRO
//
FUNCTION_START(SymCryptAesDecryptAsmInternal, 0, 0)
AES_DECRYPT_MACRO SymCryptAesDecryptAsmInternalLoop
FUNCTION_END(SymCryptAesDecryptAsmInternal)
#else
//
// No block function, use the macro directly
//
MACRO_START(AES_ENCRYPT, loopLabel)
AES_ENCRYPT_MACRO loopLabel
MACRO_END()
MACRO_START(AES_DECRYPT, loopLabel)
AES_DECRYPT_MACRO loopLabel
MACRO_END()
#endif
//
//VOID
//SYMCRYPT_CALL
//SymCryptAesEncrypt( _In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
// _In_reads_bytes_( SYMCRYPT_AES_BLOCK_LEN ) PCBYTE pbPlaintext,
// _Out_writes_bytes_( SYMCRYPT_AES_BLOCK_LEN ) PBYTE pbCiphertext )
//
NESTED_FUNCTION_START(SymCryptAesEncryptAsm, 3, 15)
SYMCRYPT_CHECK_MAGIC SymCryptAesEncryptAsmCheckMagic, Q1, magicFieldOffset, Q1
// Here we convert from whatever calling convention we are called from externally to our
// AES internal calling convention.
// We need to be careful that we don't overwrite an argument register before we copy it to
// the place it is needed internally in the AES functions.
// There is no automatic method for checking we do this correctly - modify with care!
// In SystemV and MSFT x64 ABIs, the possible 3 argument registers are:
// rcx, rdx, r8, rdi, rsi
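// (Q1, Q2, Q3 map to rcx, rdx, r8 under the msft x64 ABI, and to rdi, rsi, rdx under SystemV,
// following each ABI's standard integer-argument order)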
mov r10, [Q1 + lastEncRoundKeyOffset]
mov r9, Q1
mov [rsp + GET_MEMSLOT_OFFSET(slot0)], Q3
//
// Load the plaintext
//
mov eax,[Q2 ]
mov ebx,[Q2 + 4]
mov ecx,[Q2 + 8]
mov edx,[Q2 + 12]
lea r11,[GET_SYMBOL_ADDRESS(SymCryptAesSboxMatrixMult)]
AES_ENCRYPT SymCryptAesEncryptAsmLoop
// Plaintext in eax, ebx, ecx, edx
// r9 points to first round key to use
// r10 is last key to use (unchanged)
// r11 points to SboxMatrixMult (unchanged)
// Ciphertext ends up in esi, edi, ebp, r8d
// retrieve pbCiphertext using Q0 because it is always rax regardless of calling convention
mov Q0,[rsp + GET_MEMSLOT_OFFSET(slot0)]
mov [Q0 ], esi
mov [Q0 + 4], edi
mov [Q0 + 8], ebp
mov [Q0 + 12], r8d
NESTED_FUNCTION_END(SymCryptAesEncryptAsm)
//
//VOID
//SYMCRYPT_CALL
//SymCryptAesDecrypt( _In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
// _In_reads_bytes_( SYMCRYPT_AES_BLOCK_LEN ) PCBYTE pbCiphertext,
// _Out_writes_bytes_( SYMCRYPT_AES_BLOCK_LEN ) PBYTE pbPlaintext )
NESTED_FUNCTION_START(SymCryptAesDecryptAsm, 3, 15)
SYMCRYPT_CHECK_MAGIC SymCryptAesDecryptAsmCheckMagic, Q1, magicFieldOffset, Q1
// Here we convert from whatever calling convention we are called from externally to our
// AES internal calling convention.
// We need to be careful that we don't overwrite an argument register before we copy or use
// the value appropriately for use in the AES functions.
// There is no automatic method for checking we do this correctly - modify with care!
// In SystemV and MSFT x64 ABIs, the possible 3 argument registers are:
// rcx, rdx, r8, rdi, rsi
mov r9,[Q1 + lastEncRoundKeyOffset]
mov r10,[Q1 + lastDecRoundKeyOffset]
mov [rsp + GET_MEMSLOT_OFFSET(slot0)], Q3
mov eax,[Q2 ]
mov ebx,[Q2 + 4]
mov ecx,[Q2 + 8]
mov edx,[Q2 + 12]
lea r11,[GET_SYMBOL_ADDRESS(SymCryptAesInvSboxMatrixMult)]
lea r12,[GET_SYMBOL_ADDRESS(SymCryptAesInvSbox)]
AES_DECRYPT SymCryptAesDecryptAsmLoop
// Ciphertext in eax, ebx, ecx, edx
// r9 points to first round key to use
// r10 is last key to use (unchanged)
// r11 points to InvSboxMatrixMult (unchanged)
// r12 points to InvSbox (unchanged)
// Plaintext ends up in esi, edi, ebp, r8d
// retrieve pbPlaintext using Q0 because it is always rax regardless of calling convention
mov Q0,[rsp + GET_MEMSLOT_OFFSET(slot0)]
mov [Q0 ], esi
mov [Q0 + 4], edi
mov [Q0 + 8], ebp
mov [Q0 + 12], r8d
NESTED_FUNCTION_END(SymCryptAesDecryptAsm)
//VOID
//SYMCRYPT_CALL
//SymCryptAesCbcEncrypt(
// _In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
// _In_reads_bytes_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbChainingValue,
// _In_reads_bytes_( cbData ) PCBYTE pbSrc,
// _Out_writes_bytes_( cbData ) PBYTE pbDst,
// SIZE_T cbData )
NESTED_FUNCTION_START(SymCryptAesCbcEncryptAsm, 5, 15)
// Here we convert from whatever calling convention we are called from externally to our
// AES internal calling convention.
// We need to be careful that we don't overwrite an argument register before we copy or use
// the value appropriately for use in the AES functions.
// There is no automatic method for checking we do this correctly - modify with care!
// In SystemV and MSFT x64 ABIs, the possible 5 argument registers are:
// rcx, rdx, r8, r9, r10, rdi, rsi
SYMCRYPT_CHECK_MAGIC SymCryptAesCbcEncryptAsmCheckMagic, Q1, magicFieldOffset, Q1
and Q5, NOT 15 // only deal with whole # blocks
jz SymCryptAesCbcEncryptNoData
mov [rsp + GET_MEMSLOT_OFFSET(slot0)], Q2 // save pbChainingValue
mov rax, Q2 // rax = pbChainingValue
mov r13, Q3 // r13 = pbSrc
mov r15, Q5 // r15 = cbData
mov r14, Q4 // r14 = pbDst
add r15, Q3 // r15 = pbSrcEnd
mov r10,[Q1 + lastEncRoundKeyOffset] // r10 = last enc round key
mov r12,Q1 // r12 = first round key to use
//
// Load the chaining state from pbChainingValue
//
mov esi,[rax ]
mov edi,[rax + 4]
mov ebp,[rax + 8]
mov r8d,[rax + 12]
lea r11,[GET_SYMBOL_ADDRESS(SymCryptAesSboxMatrixMult)]
ALIGN(16)
SymCryptAesCbcEncryptAsmLoop:
// Loop register setup
// r10 = last round key to use
// r12 = first round key to use
// r13 = pbSrc
// r14 = pbDst
// r15 = pbSrcEnd
// chaining state in (esi,edi,ebp,r8d)
mov eax, [r13]
mov r9, r12
mov ebx, [r13+4]
xor eax, esi
mov ecx, [r13+8]
xor ebx, edi
xor ecx, ebp
mov edx, [r13+12]
xor edx, r8d
add r13, 16
AES_ENCRYPT SymCryptAesCbcEncryptAsmInnerLoop
//
// Plaintext in eax, ebx, ecx, edx
// r9 points to first round key to use
// r10 is last key to use (unchanged)
// r11 points to SboxMatrixMult (unchanged)
// Ciphertext ends up in esi, edi, ebp, r8d
//
mov [r14], esi
mov [r14+4], edi
mov [r14+8], ebp
mov [r14+12], r8d
add r14, 16
cmp r13, r15
jb SymCryptAesCbcEncryptAsmLoop
//
// Update the chaining value
//
mov Q0,[rsp + GET_MEMSLOT_OFFSET(slot0)]
mov [Q0], esi
mov [Q0+4], edi
mov [Q0+8], ebp
mov [Q0+12], r8d
SymCryptAesCbcEncryptNoData:
NESTED_FUNCTION_END(SymCryptAesCbcEncryptAsm)
//VOID
//SYMCRYPT_CALL
//SymCryptAesCbcDecrypt(
// _In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
// _In_reads_bytes_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbChainingValue,
// _In_reads_bytes_( cbData ) PCBYTE pbSrc,
// _Out_writes_bytes_( cbData ) PBYTE pbDst,
// SIZE_T cbData )
NESTED_FUNCTION_START(SymCryptAesCbcDecryptAsm, 5, 15)
// Here we convert from whatever calling convention we are called from externally to our
// AES internal calling convention.
// We need to be careful that we don't overwrite an argument register before we copy or use
// the value appropriately for use in the AES functions.
// There is no automatic method for checking we do this correctly - modify with care!
// In SystemV and MSFT x64 ABIs, the possible 5 argument registers are:
// rcx, rdx, r8, r9, r10, rdi, rsi
SYMCRYPT_CHECK_MAGIC SymCryptAesCbcDecryptAsmCheckMagic, Q1, magicFieldOffset, Q1
and Q5, NOT 15
jz SymCryptAesCbcDecryptNoData
mov [rsp + GET_MEMSLOT_OFFSET(slot0)], Q2 // save pbChainingValue
mov [rsp + GET_MEMSLOT_OFFSET(slot1)], Q3 // save pbSrc
lea r14, [Q5 - 16]
lea r15, [Q4 + r14] // r15 = pbDst pointed to last block
add r14, Q3 // r14 = pbSrc pointed to last block
mov r13,[Q1 + lastEncRoundKeyOffset]
mov r10,[Q1 + lastDecRoundKeyOffset]
lea r11,[GET_SYMBOL_ADDRESS(SymCryptAesInvSboxMatrixMult)]
lea r12,[GET_SYMBOL_ADDRESS(SymCryptAesInvSbox)]
//
// Load last ciphertext block & save on stack (we need to put it in the pbChaining buffer later)
//
mov eax,[r14]
mov ebx,[r14+4]
mov ecx,[r14+8]
mov edx,[r14+12]
mov [rsp + GET_MEMSLOT_OFFSET(slot2) ], eax
mov [rsp + GET_MEMSLOT_OFFSET(slot2)+4], ebx
mov [rsp + GET_MEMSLOT_OFFSET(slot3) ], ecx
mov [rsp + GET_MEMSLOT_OFFSET(slot3)+4], edx
jmp SymCryptAesCbcDecryptAsmLoopEntry
ALIGN(16)
SymCryptAesCbcDecryptAsmLoop:
// Loop register setup
// r13 = first round key to use
// r14 = pbSrc
// r15 = pbDst
// [slot1] = pbSrcStart
// current ciphertext block (esi,edi,ebp,r8d)
mov eax,[r14-16]
mov ebx,[r14-12]
xor esi,eax
mov ecx,[r14-8]
xor edi,ebx
mov [r15],esi
mov edx,[r14-4]
xor ebp,ecx
mov [r15+4],edi
xor r8d,edx
mov [r15+8],ebp
mov [r15+12],r8d
sub r14,16
sub r15,16
SymCryptAesCbcDecryptAsmLoopEntry:
mov r9, r13
AES_DECRYPT SymCryptAesCbcDecryptAsmInnerLoop
//
// Ciphertext in eax, ebx, ecx, edx
// r9 points to first round key to use
// r10 is last key to use (unchanged)
// r11 points to InvSboxMatrixMult (unchanged)
// r12 points to InvSbox (unchanged)
// Plaintext ends up in esi, edi, ebp, r8d
//
cmp r14, [rsp + GET_MEMSLOT_OFFSET(slot1)] // pbSrc
ja SymCryptAesCbcDecryptAsmLoop
mov rbx,[rsp + GET_MEMSLOT_OFFSET(slot0)] // pbChainingValue
xor esi,[rbx]
xor edi,[rbx+4]
xor ebp,[rbx+8]
xor r8d,[rbx+12]
mov [r15], esi
mov [r15+4], edi
mov [r15+8], ebp
mov [r15+12], r8d
//
// Update the chaining value to the last ciphertext block
//
mov rax,[rsp + GET_MEMSLOT_OFFSET(slot2)]
mov rcx,[rsp + GET_MEMSLOT_OFFSET(slot3)]
mov [rbx], rax
mov [rbx+8], rcx
SymCryptAesCbcDecryptNoData:
NESTED_FUNCTION_END(SymCryptAesCbcDecryptAsm)
//VOID
//SYMCRYPT_CALL
//SymCryptAesCtrMsb64(
// _In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
// _In_reads_bytes_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbChainingValue,
// _In_reads_bytes_( cbData ) PCBYTE pbSrc,
// _Out_writes_bytes_( cbData ) PBYTE pbDst,
// SIZE_T cbData )
NESTED_FUNCTION_START(SymCryptAesCtrMsb64Asm, 5, 15)
// Here we convert from whatever calling convention we are called from externally to our
// AES internal calling convention.
// We need to be careful that we don't overwrite an argument register before we copy or use
// the value appropriately for use in the AES functions.
// There is no automatic method for checking we do this correctly - modify with care!
// In SystemV and MSFT x64 ABIs, the possible 5 argument registers are:
// rcx, rdx, r8, r9, r10, rdi, rsi
SYMCRYPT_CHECK_MAGIC SymCryptAesCtrMsb64AsmCheckMagic, Q1, magicFieldOffset, Q1
and Q5, NOT 15 // only deal with whole # blocks
jz SymCryptAesCtrMsb64NoData
mov [rsp + GET_MEMSLOT_OFFSET(slot0)], Q2 // save pbChainingState
mov rax, Q2 // rax = pbChainingValue
mov r13, Q3 // r13 = pbSrc
mov r14, Q5 // r14 = cbData
mov r15, Q4 // r15 = pbDst
add r14, Q3 // r14 = cbData + pbSrc = pbSrcEnd
mov r10,[Q1 + lastEncRoundKeyOffset] // r10 = last enc round key
mov r12,Q1 // r12 = first round key to use
lea r11,[GET_SYMBOL_ADDRESS(SymCryptAesSboxMatrixMult)]
//
// Load the chaining state
//
mov rcx, [rax + 8]
mov rax, [rax ]
//
// Store it in our local copy (we have no register free to keep pbChainingState in)
//
mov [rsp + GET_MEMSLOT_OFFSET(slot1)], rax
mov [rsp + GET_MEMSLOT_OFFSET(slot2)], rcx
//
// Move to the right registers
//
mov rbx, rax
mov rdx, rcx
shr rbx, 32
shr rdx, 32
ALIGN(16)
SymCryptAesCtrMsb64AsmLoop:
// Loop invariant
// Current chaining state is in (eax, ebx, ecx, edx)
// r10 = last round key to use
// r11 = SboxMatrixMult
// r12 = first round key to use
// r13 = pbSrc
// r14 = pbSrcEnd
// r15 = pbDst
// [slot1..slot2] = 16 bytes chaining state block
mov r9, r12
AES_ENCRYPT SymCryptAesCtrMsb64AsmInnerLoop
//
// Plaintext in eax, ebx, ecx, edx
// r9 points to first round key to use
// r10 is last key to use (unchanged)
// r11 points to SboxMatrixMult (unchanged)
// Ciphertext ends up in esi, edi, ebp, r8d
//
// To improve latency, we FIRST
// load the chaining state, increment the counter, and write it back.
// leave the state in the (eax, ebx, ecx, edx) registers
mov eax,dword ptr [rsp + GET_MEMSLOT_OFFSET(slot1) + 0]
mov ebx,dword ptr [rsp + GET_MEMSLOT_OFFSET(slot1) + 4]
mov rcx,[rsp + GET_MEMSLOT_OFFSET(slot2) ]
bswap rcx
add rcx, 1
bswap rcx
mov [rsp + GET_MEMSLOT_OFFSET(slot2) ], rcx
mov rdx, rcx
shr rdx, 32
// THEN we process the XOR of the key stream with the data
// This order is faster as we need to have the chaining state done
// before we can proceed, but there are no dependencies on the data result
// So we can loop back to the beginning while the data stream read/writes are
// still in flight.
//
// xor with the source stream
xor esi,[r13 + 0 ]
xor edi,[r13 + 4 ]
xor ebp,[r13 + 8 ]
xor r8d,[r13 + 12]
// store at the destination
mov [r15 + 0], esi
mov [r15 + 4], edi
mov [r15 + 8], ebp
mov [r15 + 12], r8d
add r13, 16 // pbSrc += 16
add r15, 16 // pbDst += 16
cmp r13, r14
jb SymCryptAesCtrMsb64AsmLoop
//
// Copy back the chaining value - we only modified the last 8 bytes, so that is all we copy
//
mov rsi,[rsp + GET_MEMSLOT_OFFSET(slot0)] // pbChainingState
mov [rsi + 8], ecx
mov [rsi + 12], edx
//
// Wipe the chaining value on stack
//
xor rax, rax
mov [rsp + GET_MEMSLOT_OFFSET(slot1)], rax
mov [rsp + GET_MEMSLOT_OFFSET(slot2)], rax
SymCryptAesCtrMsb64NoData:
NESTED_FUNCTION_END(SymCryptAesCtrMsb64Asm)
FILE_END()

View File

@ -1,529 +0,0 @@
;
; fdef369_asm.asm Assembler code for large integer arithmetic in the default data format
;
; This file contains alternative routines that are used for modular computations
; where the modulus is 257-384 or 513-576 bits long.
; (Currently on ARM64 it is also used for 0-192-bit moduli but not on AMD64)
;
; The immediate advantage is that it improves EC performance on 384- and 521-bit curves.
;
; Most of this code is a direct copy of the default code.
; AMD64 digits are now 512 bits.
; We read the 'nDigits' value. If it is 1 digit, the values are 6 64-bit words, if it is 2 the values
; are 9 64-bit words. As we compute in groups of 3, our loop counters are one more than nDigits
;
; Copyright (c) Microsoft Corporation. Licensed under the MIT license.
;
include ksamd64.inc
include symcrypt_version.inc
include symcrypt_magic.inc
include C_asm_shared.inc
; A digit consists of 4 words of 64 bits each
;UINT32
;SYMCRYPT_CALL
;SymCryptFdef369RawAddAsm(
; _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 Src1,
; _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 Src2,
; _Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 Dst,
; UINT32 nDigits );
LEAF_ENTRY SymCryptFdef369RawAddAsm, _TEXT
; rcx = Src1
; rdx = Src2
; r8 = Dst
; r9 = nDigits
add r9, 1
xor rax, rax
xor r10, r10
; Cy = 0
SymCryptFdef369RawAddAsmLoop:
; carry is in the carry flag
mov rax,[rcx]
adc rax,[rdx]
mov [r8],rax
mov rax,[rcx + 8]
adc rax,[rdx + 8]
mov [r8 + 8], rax
mov rax,[rcx + 16]
adc rax,[rdx + 16]
mov [r8 + 16], rax
lea rcx, [rcx + 24]
lea rdx, [rdx + 24]
lea r8, [r8 + 24]
dec r9d
jnz SymCryptFdef369RawAddAsmLoop
mov rax, r10
adc rax, r10
ret
LEAF_END SymCryptFdef369RawAddAsm, _TEXT
;UINT32
;SYMCRYPT_CALL
;SymCryptFdefRawSubAsm(
; _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc1,
; _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc2,
; _Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 pDst,
; UINT32 nDigits );
LEAF_ENTRY SymCryptFdef369RawSubAsm, _TEXT
; rcx = Src1
; rdx = Src2
; r8 = Dst
; r9 = nDigits
add r9, 1
xor rax, rax
xor r10, r10
SymCryptFdef369RawSubAsmLoop:
; carry is in the carry flag
mov rax,[rcx]
sbb rax,[rdx]
mov [r8],rax
mov rax,[rcx + 8]
sbb rax,[rdx + 8]
mov [r8 + 8], rax
mov rax,[rcx + 16]
sbb rax,[rdx + 16]
mov [r8 + 16], rax
lea rcx, [rcx + 24]
lea rdx, [rdx + 24]
lea r8, [r8 + 24]
dec r9d
jnz SymCryptFdef369RawSubAsmLoop
mov rax, r10
adc rax, r10
ret
LEAF_END SymCryptFdef369RawSubAsm, _TEXT
;VOID
;SYMCRYPT_CALL
;SymCryptFdefMaskedCopy(
; _In_reads_bytes_( nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCBYTE pbSrc,
; _InOut_writes_bytes_( nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PBYTE pbDst,
; UINT32 nDigits,
; UINT32 mask )
LEAF_ENTRY SymCryptFdef369MaskedCopyAsm, _TEXT
add r8d, 1
movsxd r9, r9d
SymCryptFdef369MaskedCopyAsmLoop:
mov rax, [rcx]
mov r10, [rdx]
xor rax, r10
and rax, r9
xor rax, r10
mov [rdx], rax
mov rax, [rcx + 8]
mov r10, [rdx + 8]
xor rax, r10
and rax, r9
xor rax, r10
mov [rdx + 8], rax
mov rax, [rcx + 16]
mov r10, [rdx + 16]
xor rax, r10
and rax, r9
xor rax, r10
mov [rdx + 16], rax
; Move on to the next digit
add rcx, 24
add rdx, 24
sub r8d, 1
jnz SymCryptFdef369MaskedCopyAsmLoop
ret
LEAF_END SymCryptFdef369MaskedCopyAsm, _TEXT
;VOID
;SYMCRYPT_CALL
;SymCryptFdefRawMul(
; _In_reads_(nWords1) PCUINT32 pSrc1,
; UINT32 nDigits1,
; _In_reads_(nWords2) PCUINT32 pSrc2,
; UINT32 nDigits2,
; _Out_writes_(nWords1 + nWords2) PUINT32 pDst )
SymCryptFdef369RawMulAsm_Frame struct
SavedRbx dq ?
SavedRdi dq ?
SavedRsi dq ?
SavedR13 dq ?
SavedR12 dq ?
returnaddress dq ?
Arg1Home dq ?
Arg2Home dq ?
Arg3Home dq ?
Arg4Home dq ?
pDst dq ?
SymCryptFdef369RawMulAsm_Frame ends
NESTED_ENTRY SymCryptFdef369RawMulAsm, _TEXT
rex_push_reg rbx
push_reg r12
push_reg r13
push_reg rsi
push_reg rdi
END_PROLOGUE
; Basic structure:
; for each word in Src1:
; Dst += Src2 * word
; Register assignments
;
; rax = tmp for mul
; rbx = word from Src1 to multiply with
; rcx = pSrc1 (updated in outer loop)
; rdx = tmp for mul
; rsi = inner loop pointer into pSrc2
; rdi = inner loop pointer into pDst
; r8 = pSrc2
; r9 = nDigits2
; r10 = pDst (incremented in outer loop)
; r11 = # words left from Src1 to process
; r12 = carry
; r13 = inner loop counter
add edx, 1
add r9d, 1
lea r11d, [edx + 2*edx] ; nDigits1 * 3 = # words in Src1 to process
mov r10, [rsp + SymCryptFdef369RawMulAsm_Frame.pDst ]
; Outer loop invariant established: rcx, r8, r9, r10
mov rsi, r8 ; rsi = pSrc2
mov rdi, r10 ; rdi = pDst + outer loop ctr
mov rbx, [rcx] ; mulword
xor r12, r12
mov r13d, r9d
; First inner loop overwrites Dst, which avoids adding the current Dst value
SymCryptFdef369RawMulAsmLoop1:
mov rax, [rsi]
mul rbx
add rax, r12
adc rdx, 0
mov [rdi], rax
mov r12, rdx
mov rax, [rsi + 8]
mul rbx
add rax, r12
adc rdx, 0
mov [rdi + 8], rax
mov r12, rdx
mov rax, [rsi + 16]
mul rbx
add rax, r12
adc rdx, 0
mov [rdi + 16], rax
mov r12, rdx
add rsi, 24
add rdi, 24
sub r13d,1
jnz SymCryptFdef369RawMulAsmLoop1
mov [rdi], rdx ; write last word, cannot overflow because Dst is at least 2 digits long
sub r11d, 1
SymCryptFdef369RawMulAsmLoopOuter:
add rcx, 8 ; move to next word of pSrc1
add r10, 8 ; move Dst pointer one word over
mov rbx, [rcx]
mov rsi, r8
mov rdi, r10
xor r12, r12
mov r13d, r9d
SymCryptFdef369RawMulAsmLoop2:
mov rax, [rsi]
mul rbx
add rax, [rdi]
adc rdx, 0
add rax, r12
adc rdx, 0
mov [rdi], rax
mov r12, rdx
mov rax, [rsi + 8]
mul rbx
add rax, [rdi + 8]
adc rdx, 0
add rax, r12
adc rdx, 0
mov [rdi + 8], rax
mov r12, rdx
mov rax, [rsi + 16]
mul rbx
add rax, [rdi + 16]
adc rdx, 0
add rax, r12
adc rdx, 0
mov [rdi + 16], rax
mov r12, rdx
add rsi, 24
add rdi, 24
sub r13d,1
jnz SymCryptFdef369RawMulAsmLoop2
mov [rdi], rdx ; write next word. (stays within Dst buffer)
sub r11d, 1
jnz SymCryptFdef369RawMulAsmLoopOuter
BEGIN_EPILOGUE
pop rdi
pop rsi
pop r13
pop r12
pop rbx
ret
NESTED_END SymCryptFdef369RawMulAsm, _TEXT
;VOID
;SymCryptFdefMontgomeryReduceAsm(
; _In_ PCSYMCRYPT_MODULUS pmMod,
; _In_ PUINT32 pSrc,
; _Out_ PUINT32 pDst )
NESTED_ENTRY SymCryptFdef369MontgomeryReduceAsm, _TEXT
rex_push_reg rbx
push_reg r12
push_reg r13
push_reg r14
push_reg rsi
push_reg rdi
push_reg rbp
END_PROLOGUE
mov r11, rdx ; r11 = pSrc
mov ebp, [rcx + SymCryptModulusNdigitsOffsetAmd64] ; nDigits
add ebp, 1
mov r13, [rcx + SymCryptModulusMontgomeryInv64OffsetAmd64] ; inv64
lea rcx, [rcx + SymCryptModulusValueOffsetAmd64] ; modulus value
lea edi, [ebp + 2*ebp] ; outer loop counter, in words
xor r14d, r14d
; General register allocations
; rax = multiply result
; rbx = multiplier in inner loop
; rcx = pointer to modulus value
; rdx = multiply result
; rsi = loop counter
; rdi = loop counter
; rbp = nDigits
; r8 = pDst
; r9 = running pointer in Src
; r10 = running pointer in Mod
; r11 = pSrc (updated in outer loop)
; r12 = carry
; r13 = pmMod->tm.montgomery.inv64
; r14 = carry out from last word of previous loop iteration
SymCryptFdef369MontgomeryReduceAsmOuterLoop:
; start decoder with a few simple instructions, including at least one that requires
; a uop execution and is on the critical path
mov rbx, [r11] ; fetch word of Src we want to set to zero
mov r10, r11
mov r9, rcx
imul rbx, r13 ; lower word is same for signed & unsigned multiply
mov esi, ebp
xor r12d, r12d
SymCryptFdef369MontgomeryReduceAsmInnerloop:
; rax = mul scratch
; rbx = multiplier
; rcx = pointer to modulus value
; rdx = mul scratch
; edi = outer loop counter (words)
; esi = inner loop counter (digits)
; r9 = running ptr to modulus
; r10 = running ptr to input/scratch
; r12 = carry (64 bits)
mov rax, [r9]
mul rbx
add rax, [r10]
adc rdx, 0
add rax, r12
adc rdx, 0
mov [r10], rax
mov r12, rdx
mov rax, [r9 + 8]
mul rbx
add rax, [r10 + 8]
adc rdx, 0
add rax, r12
adc rdx, 0
mov [r10 + 8], rax
mov r12, rdx
mov rax, [r9 + 16]
mul rbx
add rax, [r10 + 16]
adc rdx, 0
add rax, r12
adc rdx, 0
mov [r10 + 16], rax
mov r12, rdx
add r9, 24
add r10, 24
sub esi,1
jnz SymCryptFdef369MontgomeryReduceAsmInnerloop
add r12, r14
mov r14d, 0
adc r14, 0
add r12, [r10]
adc r14, 0
mov [r10], r12
add r11, 8
sub edi, 1
jnz SymCryptFdef369MontgomeryReduceAsmOuterLoop
;
; Most of the work is done; now all that is left is subtract the modulus if it is smaller than the result
;
; First we compute the pSrc result minus the modulus into the destination
mov esi, ebp ; loop ctr
mov r10, r11 ; pSrc
mov r9, rcx ; pMod
mov r12, r8 ; pDst
; Cy = 0 because the last 'sub edi,1' resulted in 0
SymCryptFdef369MontgomeryReduceAsmSubLoop:
mov rax,[r10]
sbb rax,[r9]
mov [r12], rax
mov rax,[r10 + 8]
sbb rax,[r9 + 8]
mov [r12 + 8], rax
mov rax,[r10 + 16]
sbb rax,[r9 + 16]
mov [r12 + 16], rax
lea r10,[r10+24]
lea r9, [r9 +24]
lea r12,[r12+24]
dec esi
jnz SymCryptFdef369MontgomeryReduceAsmSubLoop
; Finally a masked copy from pSrc to pDst
; copy if: r14 == 0 && Cy = 1
sbb r14, 0 ; mask (64 bits)
SymCryptFdef369MontgomeryReduceAsmMaskedCopyLoop:
mov rax, [r11]
mov rsi, [r8]
xor rax, rsi
and rax, r14
xor rax, rsi
mov [r8], rax
mov rax, [r11 + 8]
mov rsi, [r8 + 8]
xor rax, rsi
and rax, r14
xor rax, rsi
mov [r8 + 8], rax
mov rax, [r11 + 16]
mov rsi, [r8 + 16]
xor rax, rsi
and rax, r14
xor rax, rsi
mov [r8 + 16], rax
; Move on to the next digit
add r11, 24
add r8, 24
sub ebp, 1
jnz SymCryptFdef369MontgomeryReduceAsmMaskedCopyLoop
BEGIN_EPILOGUE
pop rbp
pop rdi
pop rsi
pop r14
pop r13
pop r12
pop rbx
ret
NESTED_END SymCryptFdef369MontgomeryReduceAsm, _TEXT
end

View File

@ -0,0 +1,451 @@
//
// fdef369_asm.symcryptasm Assembler code for large integer arithmetic in the default data format
// Expresses asm in a generic enough way to enable generation of MASM and GAS using the
// symcryptasm_processor.py script and C preprocessor
//
// This file contains alternative routines that are used for modular computations
// where the modulus is 257-384 or 513-576 bits long.
// (Currently on ARM64 it is also used for 0-192-bit moduli but not on AMD64)
//
// The immediate advantage is that it improves EC performance on 384- and 521-bit curves.
//
// Most of this code is a direct copy of the default code.
// AMD64 digits are now 512 bits.
// We read the 'nDigits' value. If it is 1 digit, the values are 6 64-bit words, if it is 2 the values
// are 9 64-bit words. As we compute in groups of 3, our loop counters are one more than nDigits
//
// Copyright (c) Microsoft Corporation. Licensed under the MIT license.
#include "symcryptasm_shared.cppasm"
// A digit consists of 4 words of 64 bits each
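// A note on the register naming used below: Qn are 64-bit symcryptasm registers which
// symcryptasm_processor.py maps onto concrete registers for the target calling convention;
// Dn is the 32-bit low half of Qn; Q0 is always rax; and QH is the high half of a multiply
// result (rdx), which is why each mul below is followed by reads of QH.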
//UINT32
//SYMCRYPT_CALL
// SymCryptFdef369RawAddAsm(
// _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 Src1,
// _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 Src2,
// _Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 Dst,
// UINT32 nDigits )
FUNCTION_START(SymCryptFdef369RawAddAsm, 4, 5)
inc D4
xor Q0, Q0
SymCryptFdef369RawAddAsmLoop:
// carry is in the carry flag
mov Q0,[Q1]
adc Q0,[Q2]
mov [Q3],Q0
mov Q0,[Q1 + 8]
adc Q0,[Q2 + 8]
mov [Q3 + 8], Q0
mov Q0,[Q1 + 16]
adc Q0,[Q2 + 16]
mov [Q3 + 16], Q0
lea Q1, [Q1 + 24]
lea Q2, [Q2 + 24]
lea Q3, [Q3 + 24]
dec D4
jnz SymCryptFdef369RawAddAsmLoop
mov Q0, 0
adc Q0, Q0
FUNCTION_END(SymCryptFdef369RawAddAsm)
// UINT32
// SYMCRYPT_CALL
// SymCryptFdef369RawSubAsm(
// _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc1,
// _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc2,
// _Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 pDst,
// UINT32 nDigits )
FUNCTION_START(SymCryptFdef369RawSubAsm, 4, 5)
inc D4
xor Q0, Q0
SymCryptFdef369RawSubAsmLoop:
// carry is in the carry flag
mov Q0,[Q1]
sbb Q0,[Q2]
mov [Q3],Q0
mov Q0,[Q1 + 8]
sbb Q0,[Q2 + 8]
mov [Q3 + 8], Q0
mov Q0,[Q1 + 16]
sbb Q0,[Q2 + 16]
mov [Q3 + 16], Q0
lea Q1, [Q1 + 24]
lea Q2, [Q2 + 24]
lea Q3, [Q3 + 24]
dec D4
jnz SymCryptFdef369RawSubAsmLoop
mov Q0, 0
adc Q0, Q0
FUNCTION_END(SymCryptFdef369RawSubAsm)
// VOID
// SYMCRYPT_CALL
// SymCryptFdef369MaskedCopyAsm(
// _In_reads_bytes_( nDigits*SYMCRYPT_FDEF_DIGIT_SIZE ) PCBYTE pbSrc,
// _Inout_updates_bytes_( nDigits*SYMCRYPT_FDEF_DIGIT_SIZE ) PBYTE pbDst,
// UINT32 nDigits,
// UINT32 mask )
FUNCTION_START(SymCryptFdef369MaskedCopyAsm, 4, 6)
inc D3
movsxd Q4, D4
SymCryptFdef369MaskedCopyAsmLoop:
mov Q0, [Q1]
mov Q5, [Q2]
xor Q0, Q5
and Q0, Q4
xor Q0, Q5
mov [Q2], Q0
mov Q0, [Q1 + 8]
mov Q5, [Q2 + 8]
xor Q0, Q5
and Q0, Q4
xor Q0, Q5
mov [Q2 + 8], Q0
mov Q0, [Q1 + 16]
mov Q5, [Q2 + 16]
xor Q0, Q5
and Q0, Q4
xor Q0, Q5
mov [Q2 + 16], Q0
// Move on to the next digit
add Q1, 24
add Q2, 24
dec D3
jnz SymCryptFdef369MaskedCopyAsmLoop
FUNCTION_END(SymCryptFdef369MaskedCopyAsm)
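The xor/and/xor sequence above is a constant-time conditional copy. A hedged C sketch of the same trick (hypothetical name):

#include <stdint.h>

// Sketch: pDst takes pSrc's words when mask is all-ones and is left
// unchanged when mask is 0; there are no data-dependent branches.
void MaskedCopySketch( const uint64_t *pSrc, uint64_t *pDst,
                       uint32_t nWords, uint64_t mask )
{
    for( uint32_t i = 0; i < nWords; i++ )
    {
        // (src ^ dst) & mask is 0 or (src ^ dst); xoring it into dst
        // therefore yields dst or src respectively.
        pDst[i] ^= (pSrc[i] ^ pDst[i]) & mask;
    }
}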
// VOID
// SYMCRYPT_CALL
// SymCryptFdef369RawMulAsm(
// _In_reads_(nDigits1*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc1,
// UINT32 nDigits1,
// _In_reads_(nDigits2*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc2,
// UINT32 nDigits2,
// _Out_writes_((nDigits1+nDigits2)*SYMCRYPT_FDEF_DIGIT_NUINT32) PUINT32 pDst )
MUL_FUNCTION_START(SymCryptFdef369RawMulAsm, 5, 11)
// Basic structure:
// for each word in Src1:
// Dst += Src2 * word
// Register assignments
//
// Q0 = tmp for mul
// QH = tmp for mul
// Q1 = pSrc1 (updated in outer loop)
// D2 = # words left from Src1 to process
// Q3 = pSrc2
// Q4 = nDigits2
// Q5 = pDst (incremented in outer loop)
// Q6 = inner loop pointer into pSrc2
// Q7 = inner loop pointer into pDst
// Q8 = word from Src1 to multiply with
// Q9 = carry
// D10 = inner loop counter
inc D2
inc D4
lea D2, [D2 + 2*D2] // 3 * (nDigits1 + 1) = # words in Src1 to process
// Outer loop invariant established: Q1, Q3, D4, Q5
mov Q6, Q3 // Q6 = pSrc2
mov Q7, Q5 // Q7 = pDst + outer loop ctr
mov Q8, [Q1] // mulword
xor Q9, Q9
mov D10, D4
// First inner loop overwrites Dst, which avoids adding the current Dst value
ALIGN(16)
SymCryptFdef369RawMulAsmLoop1:
mov Q0, [Q6]
mul Q8
add Q0, Q9
adc QH, 0
mov [Q7], Q0
mov Q9, QH
mov Q0, [Q6 + 8]
mul Q8
add Q0, Q9
adc QH, 0
mov [Q7 + 8], Q0
mov Q9, QH
mov Q0, [Q6 + 16]
mul Q8
add Q0, Q9
adc QH, 0
mov [Q7 + 16], Q0
mov Q9, QH
add Q6, 24
add Q7, 24
dec D10
jnz SymCryptFdef369RawMulAsmLoop1
mov [Q7], QH // write last word, cannot overflow because Dst is at least 2 digits long
dec D2
ALIGN(16)
SymCryptFdef369RawMulAsmLoopOuter:
add Q1, 8 // move to next word of pSrc1
add Q5, 8 // move Dst pointer one word over
mov Q8, [Q1]
mov Q6, Q3
mov Q7, Q5
xor Q9, Q9
mov D10, D4
ALIGN(16)
SymCryptFdef369RawMulAsmLoop2:
mov Q0, [Q6]
mul Q8
add Q0, [Q7]
adc QH, 0
add Q0, Q9
adc QH, 0
mov [Q7], Q0
mov Q9, QH
mov Q0, [Q6 + 8]
mul Q8
add Q0, [Q7 + 8]
adc QH, 0
add Q0, Q9
adc QH, 0
mov [Q7 + 8], Q0
mov Q9, QH
mov Q0, [Q6 + 16]
mul Q8
add Q0, [Q7 + 16]
adc QH, 0
add Q0, Q9
adc QH, 0
mov [Q7 + 16], Q0
mov Q9, QH
add Q6, 24
add Q7, 24
dec D10
jnz SymCryptFdef369RawMulAsmLoop2
mov [Q7], QH // write next word. (stays within Dst buffer)
dec D2
jnz SymCryptFdef369RawMulAsmLoopOuter
MUL_FUNCTION_END(SymCryptFdef369RawMulAsm)
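The "Basic structure" comment above is the classic schoolbook multiply. A minimal C sketch (hypothetical name; unsigned __int128 stands in for the mul/adc chains) of the same two-loop shape, including the first pass that writes rather than adds:

#include <stdint.h>

// Sketch: pDst = pSrc1 * pSrc2, word counts already expanded from digits.
void RawMulSketch( const uint64_t *pSrc1, uint32_t nWords1,
                   const uint64_t *pSrc2, uint32_t nWords2, uint64_t *pDst )
{
    for( uint32_t i = 0; i < nWords1; i++ )     // outer loop over Src1 words
    {
        uint64_t carry = 0;
        for( uint32_t j = 0; j < nWords2; j++ ) // inner loop (Loop1/Loop2)
        {
            unsigned __int128 t = (unsigned __int128) pSrc1[i] * pSrc2[j]
                                + carry + (i == 0 ? 0 : pDst[i + j]);
            pDst[i + j] = (uint64_t) t;
            carry = (uint64_t)(t >> 64);
        }
        pDst[i + nWords2] = carry;  // top word; stays within the Dst buffer
    }
}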
// VOID
// SYMCRYPT_CALL
// SymCryptFdef369MontgomeryReduceAsm(
// _In_ PCSYMCRYPT_MODULUS pmMod,
// _Inout_ PUINT32 pSrc,
// _Out_ PUINT32 pDst )
MUL_FUNCTION_START(SymCryptFdef369MontgomeryReduceAsm, 3, 13)
mov D4, [Q1 + SymCryptModulusNdigitsOffsetAmd64] // nDigits
inc D4
mov Q5, [Q1 + SymCryptModulusMontgomeryInv64OffsetAmd64] // inv64
lea Q1, [Q1 + SymCryptModulusValueOffsetAmd64] // modulus value
lea D12, [D4 + 2*D4] // outer loop counter, in words
xor D8, D8
// General register allocations
// Q0 = multiply result
// QH = multiply result
// Q1 = pointer to modulus value
// Q2 = pSrc (updated in outer loop)
// Q3 = pDst
// D4 = nDigits
// Q5 = pmMod->tm.montgomery.inv64
// Q6 = multiplier in inner loop
// Q7 = carry
// Q8 = carry out from last word of previous loop iteration
// Q9 = running pointer in Src
// Q10 = running pointer in Mod
// D11 = loop counter
// D12 = outer loop counter (words)
ALIGN(16)
SymCryptFdef369MontgomeryReduceAsmOuterLoop:
// start decoder with a few simple instructions, including at least one that requires
// a uop execution and is on the critical path
mov Q6, [Q2] // fetch word of Src we want to set to zero
mov Q10, Q2
mov Q9, Q1
imul Q6, Q5 // lower word is same for signed & unsigned multiply
mov D11, D4
xor D7, D7
ALIGN(16)
SymCryptFdef369MontgomeryReduceAsmInnerloop:
// Q0 = mul scratch
// QH = mul scratch
// Q1 = pointer to modulus value
// Q6 = multiplier
// Q7 = carry (64 bits)
// Q9 = running ptr to modulus
// Q10 = running ptr to input/scratch
// D11 = inner loop counter (digits)
// D12 = outer loop counter (words)
mov Q0, [Q9]
mul Q6
add Q0, [Q10]
adc QH, 0
add Q0, Q7
adc QH, 0
mov [Q10], Q0
mov Q7, QH
mov Q0, [Q9 + 8]
mul Q6
add Q0, [Q10 + 8]
adc QH, 0
add Q0, Q7
adc QH, 0
mov [Q10 + 8], Q0
mov Q7, QH
mov Q0, [Q9 + 16]
mul Q6
add Q0, [Q10 + 16]
adc QH, 0
add Q0, Q7
adc QH, 0
mov [Q10 + 16], Q0
mov Q7, QH
add Q9, 24
add Q10, 24
dec D11
jnz SymCryptFdef369MontgomeryReduceAsmInnerloop
add Q7, Q8
mov D8, 0
adc Q8, 0
add Q7, [Q10]
adc Q8, 0
mov [Q10], Q7
add Q2, 8
dec D12
jnz SymCryptFdef369MontgomeryReduceAsmOuterLoop
//
// Most of the work is done - now all that is left is to subtract the modulus if it is smaller than the result
//
// First we compute the pSrc result minus the modulus into the destination
mov D11, D4 // loop ctr
mov Q10, Q2 // pSrc
mov Q9, Q1 // pMod
mov Q7, Q3 // pDst
// Cy = 0 because the last 'adc Q8,0' resulted in 0, 1, or 2
ALIGN(16)
SymCryptFdef369MontgomeryReduceAsmSubLoop:
mov Q0,[Q10]
sbb Q0,[Q9]
mov [Q7], Q0
mov Q0,[Q10 + 8]
sbb Q0,[Q9 + 8]
mov [Q7 + 8], Q0
mov Q0,[Q10 + 16]
sbb Q0,[Q9 + 16]
mov [Q7 + 16], Q0
lea Q10,[Q10 + 24]
lea Q9,[Q9 + 24]
lea Q7,[Q7 + 24]
dec D11
jnz SymCryptFdef369MontgomeryReduceAsmSubLoop
// Finally a masked copy from pSrc to pDst
// copy if: Q8 == 0 && Cy == 1
sbb Q8, 0 // mask (64 bits)
ALIGN(16)
SymCryptFdef369MontgomeryReduceAsmMaskedCopyLoop:
mov Q0, [Q2]
mov Q1, [Q3]
xor Q0, Q1
and Q0, Q8
xor Q0, Q1
mov [Q3], Q0
mov Q0, [Q2 + 8]
mov Q1, [Q3 + 8]
xor Q0, Q1
and Q0, Q8
xor Q0, Q1
mov [Q3 + 8], Q0
mov Q0, [Q2 + 16]
mov Q1, [Q3 + 16]
xor Q0, Q1
and Q0, Q8
xor Q0, Q1
mov [Q3 + 16], Q0
// Move on to the next digit
add Q2, 24
add Q3, 24
dec D4
jnz SymCryptFdef369MontgomeryReduceAsmMaskedCopyLoop
MUL_FUNCTION_END(SymCryptFdef369MontgomeryReduceAsm)
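A hedged C sketch of the whole reduction (hypothetical names; unsigned __int128 in place of mul/adc; inv64 is the negated modular inverse stored in pmMod->tm.montgomery.inv64, so that adding m*Mod zeroes the bottom word each pass):

#include <stdint.h>

void MontgomeryReduceSketch( const uint64_t *pMod, uint64_t inv64,
                             uint64_t *pSrc, uint64_t *pDst, uint32_t nWords )
{
    uint64_t hc = 0;                                // high carry (Q8)

    for( uint32_t w = 0; w < nWords; w++ )          // outer loop (D12)
    {
        uint64_t m = pSrc[w] * inv64;               // multiplier (Q6)
        uint64_t c = 0;                             // running carry (Q7)
        for( uint32_t i = 0; i < nWords; i++ )      // inner loop; 3 words/iter in the asm
        {
            unsigned __int128 t = (unsigned __int128) m * pMod[i] + pSrc[w + i] + c;
            pSrc[w + i] = (uint64_t) t;             // word w becomes zero when i == 0
            c = (uint64_t)(t >> 64);
        }
        unsigned __int128 t = (unsigned __int128) pSrc[w + nWords] + c + hc;
        pSrc[w + nWords] = (uint64_t) t;
        hc = (uint64_t)(t >> 64);
    }

    // Subtract the modulus from the shifted-down result ...
    uint64_t borrow = 0;
    for( uint32_t i = 0; i < nWords; i++ )
    {
        unsigned __int128 t = (unsigned __int128) pSrc[nWords + i] - pMod[i] - borrow;
        pDst[i] = (uint64_t) t;
        borrow = (uint64_t)(t >> 64) & 1;
    }

    // ... and keep the unsubtracted value only if the subtraction borrowed
    // while hc == 0, via the same constant-time masked copy as above.
    uint64_t mask = 0 - (uint64_t)(borrow & (hc ^ 1));
    for( uint32_t i = 0; i < nWords; i++ )
    {
        pDst[i] ^= (pDst[i] ^ pSrc[nWords + i]) & mask;
    }
}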
FILE_END()

Diff not shown because of its large size.

Diff not shown because of its large size.


@ -1,224 +0,0 @@
;
; Macros for the multiplication routines in amd64
; Copyright (c) Microsoft Corporation. Licensed under the MIT license.
;
; General multiplication
MULT_SINGLEADD_128 MACRO index, src_reg, dst_reg
; rax = mul scratch
; rbx = multiplier
; rdx = mul scratch
; src_reg = running ptr to input
; dst_reg = running ptr to output/scratch
; r12 = carry for even words (64 bits)
; r15 = carry for odd words (64 bits)
mov rax, [src_reg + 8*index]
mul rbx
mov r15, rdx
add rax, r12
mov [dst_reg + 8*index], rax
adc r15, 0
mov rax, [src_reg + 8*(index+1)]
mul rbx
mov r12, rdx
add rax, r15
mov [dst_reg + 8*(index+1)], rax
adc r12, 0
ENDM
MULT_DOUBLEADD_128 MACRO index, src_reg, dst_reg
; rax = mul scratch
; rbx = multiplier
; rdx = mul scratch
; src_reg = running ptr to input
; dst_reg = running ptr to output/scratch
; r12 = carry for even words (64 bits)
; r15 = carry for odd words (64 bits)
mov rax, [src_reg + 8*index]
mul rbx
mov r15, rdx
add rax, [dst_reg + 8*index]
adc r15, 0
add rax, r12
mov [dst_reg + 8*index], rax
adc r15, 0
mov rax, [src_reg + 8*(index+1)]
mul rbx
mov r12, rdx
add rax, [dst_reg + 8*(index+1)]
adc r12, 0
add rax, r15
mov [dst_reg + 8*(index+1)], rax
adc r12, 0
ENDM
; Squaring
SQR_SINGLEADD_64 MACRO index, src_reg, dst_reg, src_carry, dst_carry
; rax = mul scratch
; rbx = multiplier
; rdx = mul scratch
; src_reg = running ptr to input
; dst_reg = running ptr to output/scratch
; src_carry = input carry
; dst_carry = output carry
mov rax, [src_reg + 8*index]
mul rbx
mov dst_carry, rdx
add rax, src_carry
mov [dst_reg + 8*index], rax
adc dst_carry, 0
ENDM
SQR_DOUBLEADD_64 MACRO index, src_reg, dst_reg, src_carry, dst_carry
; rax = mul scratch
; rbx = multiplier
; rdx = mul scratch
; src_reg = running ptr to input
; dst_reg = running ptr to output/scratch
; src_carry = input carry
; dst_carry = output carry
mov rax, [src_reg + 8*index]
mul rbx
mov dst_carry, rdx
add rax, [dst_reg + 8*index]
adc dst_carry, 0
add rax, src_carry
mov [dst_reg + 8*index], rax
adc dst_carry, 0
ENDM
SQR_SHIFT_LEFT MACRO index
mov rax, [rdi + 8*index]
adc rax, rax ; Shift left and add the carry
mov [rdi + 8*index], rax
ENDM
SQR_DIAGONAL_PROP MACRO index
;;;;;;;;;;;;;;;;;;;;;;;;
; Calculating the square
mov rax, [rsi + 8*index] ; mulword
mul rax ; m^2
; Adding the square to the even column
add rax, [rdi + 16*index]
adc rdx, 0
add rax, r12
adc rdx, 0
mov [rdi + 16*index], rax
; Propagating the sum to the next column
mov rax, rdx
xor rdx, rdx
add rax, [rdi + 16*index + 8]
adc rdx, 0
mov [rdi + 16*index + 8], rax
mov r12, rdx
ENDM
; Size-specific macros
; A common prologue & epilogue between several functions allows jumping between them...
MULT_COMMON_PROLOGUE MACRO
; We need all the registers
push_reg r12
push_reg r13
push_reg r14
push_reg r15
push_reg rdi
push_reg rsi
push_reg rbx
push_reg rbp
END_PROLOGUE
ENDM
MULT_COMMON_EPILOGUE MACRO
BEGIN_EPILOGUE
pop rbp
pop rbx
pop rsi
pop rdi
pop r15
pop r14
pop r13
pop r12
ret
ENDM
MUL14 MACRO Mult, pA, R0, R1, R2, R3, Cy
; (R0, R1, R2, R3, rdx) = Mult * (A0..3) + (R0, R1, R2, R3)
; Cy, rax = scratch
mov rax, [pA]
mul Mult
add R0, rax
adc rdx, 0
mov Cy, rdx
mov rax, [pA + 8]
mul Mult
add R1, rax
adc rdx, 0
add R1, Cy
adc rdx, 0
mov Cy, rdx
mov rax, [pA + 16]
mul Mult
add R2, rax
adc rdx, 0
add R2, Cy
adc rdx, 0
mov Cy, rdx
mov rax, [pA + 24]
mul Mult
add R3, rax
adc rdx, 0
add R3, Cy
adc rdx, 0
ENDM
; Macros for size-specific squaring
SQR_DOUBLEADD_64_2 MACRO index
SQR_DOUBLEADD_64 (index), rsi, rdi, r12, r15
SQR_DOUBLEADD_64 (index + 1), rsi, rdi, r15, r12
ENDM
SQR_DOUBLEADD_64_4 MACRO index
SQR_DOUBLEADD_64_2 (index)
SQR_DOUBLEADD_64_2 (index + 2)
ENDM
SQR_DOUBLEADD_64_8 MACRO index
SQR_DOUBLEADD_64_4 (index)
SQR_DOUBLEADD_64_4 (index + 4)
ENDM
SQR_SIZE_SPECIFIC_INIT MACRO
lea rcx, [rcx + 8] ; move Src pointer 1 word over
lea r10, [r10 + 16] ; move Dst pointer 2 words over
mov rsi, rcx ; rsi = inner pSrc
mov rdi, r10 ; rdi = inner pDst
mov rbx, [rcx] ; Get the next mulword
lea rsi, [rsi + 8] ; move Src pointer 1 word over
ENDM
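SQR_DIAGONAL_PROP is the least obvious of these macros. A hedged C sketch of one step (hypothetical names; unsigned __int128 in place of mul/adc): after the doubled cross products are in place, it adds pSrc[index]^2 into column 2*index and folds the sum into column 2*index+1.

#include <stdint.h>

// Sketch of one diagonal step; *pCarry plays the role of r12.
void SqrDiagonalStepSketch( const uint64_t *pSrc, uint64_t *pDst,
                            uint32_t index, uint64_t *pCarry )
{
    unsigned __int128 sq = (unsigned __int128) pSrc[index] * pSrc[index];

    // Add the low square word and the incoming carry into the even column.
    unsigned __int128 lo = (unsigned __int128) pDst[2*index] + (uint64_t) sq + *pCarry;
    pDst[2*index] = (uint64_t) lo;

    // Propagate the high square word plus the add carries into the odd column.
    unsigned __int128 hi = (unsigned __int128) pDst[2*index + 1]
                         + (uint64_t)(sq >> 64) + (uint64_t)(lo >> 64);
    pDst[2*index + 1] = (uint64_t) hi;
    *pCarry = (uint64_t)(hi >> 64);
}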

Diff not shown because of its large size.

Diff not shown because of its large size.


@ -1,423 +0,0 @@
;
; Sha1Asm.Asm
;
; Copyright (c) Microsoft Corporation. Licensed under the MIT license.
;
;
;
; This module implements the bulk processing of the FIPS 180-1 SHA message digest algorithm
; for the x64 processor architecture.
;
; This implementation is derived from the 32-bit one, which in turn is derived
; from an older one by Scott Field and Dan Shumow.
;
include ksamd64.inc
TITLE sha1asm.asm
;
; The four round constants used by SHA-1
;
K0_19 EQU 05a827999H
K20_39 EQU 06ed9eba1H
K40_59 EQU 08f1bbcdcH
K60_79 EQU 0ca62c1d6H
;VOID
;SYMCRYPT_CALL
;SymCryptSha1AppendBlocks( _Inout_updates_( 5 ) PUINT32 H,
; _In_reads_bytes_( cbData ) PCBYTE pbData,
; SIZE_T cbData )
;
;
; This function allocates stack space, so it is not a LEAF function
; but a nested one.
;
NESTED_ENTRY SymCryptSha1AppendBlocksAsm, _TEXT
;
; To keep stack manipulations simple we define a structure and use that for all accesses.
;
SymCryptSha1AppendBlocksFrame struct 16, NONUNIQUE
;
; To keep the RSP aligned we need (8 mod 16) bytes of local stack space.
; this is the case, so there is no need for a dummy location
;
Wbuf dd 16 dup (?)
EndAddress dq ?
SaveR12 dq ?
SaveR13 dq ?
SaveR14 dq ?
SaveR15 dq ?
SaveRdi dq ?
SaveRsi dq ?
SaveRbp dq ?
SaveRbx dq ?
ReturnAddress dq ?
CallerP1Home dq ?
CallerP2Home dq ?
CallerP3Home dq ?
CallerP4Home dq ?
SymCryptSha1AppendBlocksFrame ends
;
; We use the W buffer extensively; this is a shorthand for the base address
;
W equ rsp+SymCryptSha1AppendBlocksFrame.Wbuf
;
; Set up our stack frame and save non-volatile registers
;
rex_push_reg rbx
push_reg rbp
push_reg rsi
push_reg rdi
push_reg r15
push_reg r14
push_reg r13
push_reg r12
alloc_stack SymCryptSha1AppendBlocksFrame.SaveR12
END_PROLOGUE
;
;Register allocation:
;
;5 registers for state
;2 scratch
;6 registers for W[t-1], W[t-2], W[t-3], W[t-14], W[t-15], W[t-16]
;1 for data pointer
;1 for H pointer
;
;
; To allow macro re-ordering of our registers we use symbolic names
; for the registers.
; s0-s4 are the 5 state registers. x1 and x2 are extra scratch registers.
; w0-w5 contain the W state cache
;
; Note: some other code puts the right value in the right register and
; has to be updated if this mapping is changed.
;
; a is in register (round % 5)
; b is in register (round+4 % 5)
; c is in register (round+3 % 5)
; d is in register (round+2 % 5)
; e is in register (round+1 % 5)
; This way, if round is incremented we move a->b, b->c, c->d, d->e, and e->a
; For optimization the actual value of a is in scratch register x1 at the start of each round
;
; W[t- 1] is in register (round % 6)
; W[t- 2] is in register (round+5 % 6)
; W[t- 3] is in register (round+4 % 6) (is loaded with W[t-13] in each round)
; W[t-14] is in register (round+3 % 6)
; W[t-15] is in register (round+2 % 6)
; W[t-16] is in register (round+1 % 6)
; If round is incremented the values all appear in their right place.
s0 EQU eax
s1 EQU ebx
s2 EQU ecx
s3 EQU edx
s4 EQU esi
w0 EQU r9d
w1 EQU r10d
w2 EQU r11d
w3 EQU r12d
w4 EQU r13d
w5 EQU r14d
x1 EQU ebp ; scratch 1
x2 EQU edi ; scratch 2
dataPtr EQU r8 ; Points to data buffer
HPtr EQU r15 ; Points to H
; At this point:
; rcx = H
; rdx = pbData
; r8 = cbData
;
; compute the end address, address of byte after last block we will process
; This code ensures that we never exceed the data buffer we were given,
; although we silently round the cbData parameter down to the next
; multiple of 64.
; Do nothing if no blocks need to be processed.
;
and r8,NOT 3fh ; round down to multiple of 64
jz SymCryptSha1AppendBlocksDone
add r8,rdx ; pbData + (cbData & ~0x3f)
mov [rsp+SymCryptSha1AppendBlocksFrame.EndAddress], r8
mov dataPtr,rdx
mov Hptr,rcx
;
; Load the H state, note that the a value lives in x1 at the round code boundary
;
mov x1,[Hptr ]
mov s4,[Hptr+ 4]
mov s3,[Hptr+ 8]
mov s2,[Hptr+12]
mov s1,[Hptr+16]
SymCryptSha1AppendBlocksLoop:
;
; This is the main loop. We process 64 bytes in each iteration.
;
; Most of the code in the loop is generated through macros using parameters to
; rename the registers.
;
ROUND_CH_0_15 MACRO round,sa,sb,sc,sd,se,wt,x1,x2
;
; Code for round 0-15.
; This code loads data from the data buffer & BSWAPs the data to get it into the
; right form.
;
; Parameters:
; round round number
; sa register that will contain the a value
; sb register that contains the b value
; sc register that contains the c value
; sd register that contains the d value
; se register that contains the e value
; x1 scratch, contains the a value on entry
; x2 scratch register.
; wt register loaded with Wt
;
; We use the formula CH(b,c,d) = ((d ^ c) & b) ^ d which uses only one temp register.
; We start with the d value as that is the oldest value and available the first
;
; See FIPS 180-2 for our symbolic notation.
;
mov x2,sd ; x2 = d
mov wt,[dataPtr+4*round] ; Fetch word from message
mov sa, x1 ; put a in the correct register
bswap wt ; wt = Wt
xor x2,sc ; x2 = (d ^ c)
rol x1,5 ; x1 = ROL(a,5)
add se,wt ; se = e + Wt
and x2,sb ; x2 = ((d ^ c) & b)
mov [W + 4*round],wt ; Store in W buffer for future use
ror sb,2 ; sb = ROL( b, 30 )
add se,x1 ; se = e + Wt + ROL(a,5)
xor x2,sd ; x2 = ((d ^ c) & b) ^ d = CH(b,c,d)
lea x1,[se+x2+K0_19] ; x1 = e + Wt + ROL(a,5) + Ch(b,c,d) + Kt
ENDM
MSG_EXP MACRO round, se, wa, wb, wc
; round round number
; se register of state to add expanded message word to
; wa register of W[round-16], will be updated to contain W[round]
; wb register of W[round-14]
; wc register of W[round- 3], will be loaded with W[round-13]
xor wc, wb ; wc = W[t-3] ^ W[t-14]
xor wa,[W+4*((round-8) MOD 16)] ; wa = W[t-16] ^ W[t-8]
xor wa, wc ; wa = W[t-16] ^ W[t-14] ^ W[t-8] ^ W[t-3]
rol wa,1 ; wa = Wt
IF round LT (80 - 1)
; do not load wc with W[t-13] in the last round; it will not be needed
mov wc,[W+4*((round-13) MOD 16)] ; wc = W[t-13]
ENDIF
add se,wa ; re = e + Wt
IF round LT (80 - 8)
; don't store Wt in the last 8 rounds. The value would never be used
mov [W+4*(round MOD 16)], wa; Store Wt
ENDIF
ENDM
ROUND_CH MACRO round, sa, sb, sc, sd, se, wa, wb, wc, x1, x2
;
; See ROUND_CH_0_15 for most parameters.
; x1 and x2 are both scratch registers
; wa register of W[round-16], will be updated to contain W[round]
; wb register of W[round-14]
; wc register of W[round- 3], will be loaded with W[round-13]
;
xor wc, wb ; wc = W[t-3] ^ W[t-14]
xor wa,[W+4*((round-8) MOD 16)] ; wa = W[t-16] ^ W[t-8]
xor wa, wc ; wa = W[t-16] ^ W[t-14] ^ W[t-8] ^ W[t-3]
rol wa,1 ; wa = Wt
mov wc,[W+4*((round-13) MOD 16)] ; wc = W[t-13]
add se,wa ; re = e + Wt
mov [W+4*(round MOD 16)], wa ; Store Wt
mov sa, x1 ; put a in the correct register
mov x2,sd ; x2 = d
rol x1,5 ; x1 = ROL(a,5)
xor x2,sc ; x2 = (d ^ c)
add se,x1 ; re = e + Wt + ROL(a,5)
and x2,sb ; x2 = ((d ^ c) & b)
ror sb,2 ; rb = ROL( b, 30 )
xor x2,sd ; x2 = ((d ^ c) & b) ^ d = CH(b,c,d)
lea x1,[se+x2+K0_19] ; re = e + Wt + ROL(a,5) + Ch(b,c,d) + Kt
ENDM
ROUND_PARITY MACRO round, sa, sb, sc, sd, se, wa, wb, wc, x1, x2, K
;
; See ROUND_CH for most parameters
; K is the round constant to use.
;
; The order of xorring the registers b, c, and d is driven by the data dependency graph.
; We start with d (the oldest) and then do b to unblock the subsequent rotate
;
MSG_EXP round, se, wa, wb, wc ; re = e + Wt
mov sa,x1 ; store a value in right register
rol x1,5 ; x1 = ROL(a,5)
add se,x1 ; re = e + Wt + ROL(a,5)
mov x2,sd ; x1 = d
xor x2,sb ; x1 = (d ^ b)
xor x2,sc ; x1 = (d ^ b ^ c) = Parity(b,c,d)
ror sb,2 ; rb = ROL( b, 30 )
lea x1,[se+x2+K] ; re = e + ROL(a,5) + Parity(b,c,d) + Wt + Kt
ENDM
ROUND_MAJ MACRO round, sa, sb, sc, sd, se, wa, wb, wc, x1, x2
;
; See above for parameter explanation
;
MSG_EXP round, se, wa, wb, wc ; re = e + Wt
mov sa,x1 ; store a value in right register
rol x1,5 ; x1 = ROL(a,5)
add se,x1 ; re = e + ROL(a,5)
mov x1,sd ; x1 = d
or x1,sc ; x1 = (d | c)
and x1,sb ; x1 = ((d | c) & b)
mov x2,sc ; x2 = c
and x2,sd ; x2 = (c & d)
or x1,x2 ; x1 = ((d | c) & b) | (d & c) = MAJ(b,c,d)
ror sb,2 ; rb = ROL( b, 30 )
lea x1,[se+x1+K40_59] ; re = e + ROL(a,5) + Wt + Maj(b,c,d) + Kt
ENDM
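For reference, the Boolean round functions these macros compute, as a small C sketch; the CH and MAJ rewrites are exactly the one-temp-register forms used above:

#include <stdint.h>

static uint32_t Ch( uint32_t b, uint32_t c, uint32_t d )
{
    return ((d ^ c) & b) ^ d;       // == (b & c) | (~b & d)
}

static uint32_t Parity( uint32_t b, uint32_t c, uint32_t d )
{
    return b ^ c ^ d;
}

static uint32_t Maj( uint32_t b, uint32_t c, uint32_t d )
{
    return ((d | c) & b) | (d & c); // == (b & c) | (b & d) | (c & d)
}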
;
; With these macros we can now produce the actual code.
; Note the use of the % operator which evaluates the expression and yields the result as text.
; Together with the macros and the s<i> and w<i> EQUs this provides us with automatic register renaming
; for each round.
;
; The first 16 rounds are more complicated as we need to use the right registers to load the msg in
; so we do those by hand
;
; W[t- 1] is in register (round % 6)
; W[t- 2] is in register (round+5 % 6)
; W[t- 3] is in register (round+4 % 6) (is loaded with W[t-13] in each round)
; W[t-14] is in register (round+3 % 6)
; W[t-15] is in register (round+2 % 6)
; W[t-16] is in register (round+1 % 6)
;
ROUND_CH_0_15 0, s0, s4, s3, s2, s1, w5, x1, x2 ;W[t-16] for t=16 is in w5
ROUND_CH_0_15 1, s1, s0, s4, s3, s2, w0, x1, x2 ;W[t-15] for t=16 is in w0
ROUND_CH_0_15 2, s2, s1, s0, s4, s3, w1, x1, x2 ;W[t-14] for t=16 is in w1
ROUND_CH_0_15 3, s3, s2, s1, s0, s4, w3, x1, x2 ;
ROUND_CH_0_15 4, s4, s3, s2, s1, s0, w4, x1, x2 ;
ROUND_CH_0_15 5, s0, s4, s3, s2, s1, w3, x1, x2 ;
ROUND_CH_0_15 6, s1, s0, s4, s3, s2, w4, x1, x2 ;
ROUND_CH_0_15 7, s2, s1, s0, s4, s3, w3, x1, x2 ;
ROUND_CH_0_15 8, s3, s2, s1, s0, s4, w4, x1, x2 ;
ROUND_CH_0_15 9, s4, s3, s2, s1, s0, w3, x1, x2 ;
ROUND_CH_0_15 10, s0, s4, s3, s2, s1, w4, x1, x2 ;
ROUND_CH_0_15 11, s1, s0, s4, s3, s2, w3, x1, x2 ;
ROUND_CH_0_15 12, s2, s1, s0, s4, s3, w4, x1, x2 ;
ROUND_CH_0_15 13, s3, s2, s1, s0, s4, w2, x1, x2 ;W[t-3] for t=16 is in w2
ROUND_CH_0_15 14, s4, s3, s2, s1, s0, w3, x1, x2 ;W[t-2] for t=16 is in w3
ROUND_CH_0_15 15, s0, s4, s3, s2, s1, w4, x1, x2 ;W[t-1] for t=16 is in w4
FOR t, <16, 17, 18, 19>
ROUND_CH t, s%(t MOD 5), s%((t+4) MOD 5), s%((t+3) MOD 5), s%((t+2) MOD 5), s%((t+1) MOD 5), w%((t+1) MOD 6), w%((t+3) MOD 6), w%((t+4) MOD 6), x1, x2
ENDM
FOR t, <20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39>
ROUND_PARITY t, s%(t MOD 5), s%((t+4) MOD 5), s%((t+3) MOD 5), s%((t+2) MOD 5), s%((t+1) MOD 5), w%((t+1) MOD 6), w%((t+3) MOD 6), w%((t+4) MOD 6), x1, x2, K20_39
ENDM
FOR t, <40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59>
ROUND_MAJ t, s%(t MOD 5), s%((t+4) MOD 5), s%((t+3) MOD 5), s%((t+2) MOD 5), s%((t+1) MOD 5), w%((t+1) MOD 6), w%((t+3) MOD 6), w%((t+4) MOD 6), x1, x2
ENDM
FOR t, <60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79>
ROUND_PARITY t, s%(t MOD 5), s%((t+4) MOD 5), s%((t+3) MOD 5), s%((t+2) MOD 5), s%((t+1) MOD 5), w%((t+1) MOD 6), w%((t+3) MOD 6), w%((t+4) MOD 6), x1, x2, K60_79
ENDM
;
; Now we update the state, & the dataPtr
;
add x1,[Hptr ]
add s4,[Hptr+ 4]
add dataPtr,64
add s3,[Hptr+ 8]
add s2,[Hptr+12]
add s1,[Hptr+16]
mov [Hptr ], x1
mov [Hptr+ 4], s4
cmp dataPtr,[rsp+SymCryptSha1AppendBlocksFrame.EndAddress] ; Loop terminating condition
mov [Hptr+ 8], s3
mov [Hptr+12], s2
mov [Hptr+16], s1
jc SymCryptSha1AppendBlocksLoop ; Main loop
;
; We're done processing the blocks. The result is already in the state, so all we have to do
; is clean up.
;
; Wipe the W buffer
; The @@: label is an anonymous label. You can refer to the previous one using @B, which is easy to read.
;
mov rcx,64
xor rax,rax
@@: sub ecx,16
mov [rsp+rcx ],rax
mov [rsp+rcx+8],rax
jnz @B
SymCryptSha1AppendBlocksDone:
add rsp, SymCryptSha1AppendBlocksFrame.SaveR12
BEGIN_EPILOGUE
pop r12
pop r13
pop r14
pop r15
pop rdi
pop rsi
pop rbp
pop rbx
ret
NESTED_END SymCryptSha1AppendBlocksAsm, _TEXT
END


@ -1,37 +0,0 @@
;
; SymCrypt_magic.inc
; Copyright (c) Microsoft Corporation. Licensed under the MIT license.
;
; Include file to define the support macros for the Magic field
;
extern SymCryptFatal:NEAR
SYMCRYPT_MAGIC_FIELD MACRO
if DBG
magic dq ?
endif
ENDM
SYMCRYPT_CODE_VERSION EQU ((SYMCRYPT_CODE_VERSION_API SHL 16) OR SYMCRYPT_CODE_VERSION_MINOR )
SYMCRYPT_MAGIC_CONSTANT EQU ('S1mv' + SYMCRYPT_CODE_VERSION)
SYMCRYPT_CHECK_MAGIC MACRO ptr, struct_name
if DBG
mov rax, [ptr + struct_name.magic]
sub rax, ptr
cmp rax, SYMCRYPT_MAGIC_CONSTANT
jz @F
mov ecx, 'magc'
call SymCryptFatal
@@:
endif
ENDM
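In C terms the macro checks the following (function name and parameter passing are illustrative; the constant and SymCryptFatal come from the library):

#include <stdint.h>

extern void SymCryptFatal( uint32_t fatalCode );

// Sketch: the magic field stores (struct address + versioned constant), so
// re-deriving it catches both corruption and structures copied by value.
void CheckMagicSketch( const void *ptr, uint64_t magicField, uint64_t magicConstant )
{
    if( magicField - (uint64_t)(uintptr_t) ptr != magicConstant )
    {
        SymCryptFatal( 0x6d616763 );    // 'magc', as in the asm
    }
}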


@ -1,171 +0,0 @@
;
; Wipe.asm
;
; Copyright (c) Microsoft Corporation. Licensed under the MIT license.
;
include ksamd64.inc
TITLE wipe.asm
;VOID
;SYMCRYPT_CALL
;SymCryptWipe( _Out_writes_bytes_( cbData ) PVOID pbData,
; SIZE_T cbData )
;
; This function allocates no stack space, calls no functions, and does not save
; any non-volatile registers. Thus it is a LEAF function
;
LEAF_ENTRY SymCryptWipeAsm, _TEXT
; rcx = pbData
; rdx = cbData
;
; This function will handle any alignment of pbData and any size, but it is optimized for
; the case where the start and end of the buffer are 16-aligned.
; 16 is the natural stack alignment on AMD64, and structures can be designed to be a multiple
; of 16 long without adding too much slack.
; The cost of non-alignment is relatively low, on the order of 5 cycles
;
xorps xmm0,xmm0 ; Zero register for 16-byte wipes
cmp rdx,16
jb SymCryptWipeAsmSmall ; if cbData < 16, this is a rare case
test rcx,15
jnz SymCryptWipeAsmUnaligned ; if data pointer is unaligned, we jump to the code that aligns the pointer
; For well-optimized callers the aligned case is the common one, and that is
; the fall-through.
SymCryptWipeAsmAligned:
;
; Here rcx is aligned, and rdx contains the # bytes left to wipe, and rdx >= 16
;
; Our loop wipes in 32-byte increments; we always wipe the first 16 bytes
; and increment the pbData pointer if cbData is 16 mod 32
; This avoids a conditional jump and is faster.
;
test rdx,16
movaps [rcx],xmm0 ; it is safe to always wipe as cbData >= 16
lea r8,[rcx+16]
cmovnz rcx,r8 ; only increment pbData if cbData = 16 mod 32
sub rdx,32 ; see if we have >= 32 bytes to wipe
jc SymCryptWipeAsmTailOptional ; if not, wipe tail, or nothing if cbData = 0 mod 16
align 16
SymCryptWipeAsmLoop:
movaps [rcx],xmm0
movaps [rcx+16],xmm0 ; Wipe 32 bytes
add rcx,32
sub rdx,32
jnc SymCryptWipeAsmLoop
SymCryptWipeAsmTailOptional:
; only the lower 4 bits of rdx are valid, we have subtracted too much already.
; The wipe was at least 16 bytes, so we can just wipe the tail with 2 instructions
and edx,15
jnz SymCryptWipeAsmTail
ret
SymCryptWipeAsmTail:
; This code appears also below at the end of the unaligned wiping routine
; but making the jnz jump further is slower and we only duplicate 4 instructions.
xor eax,eax
mov [rcx+rdx-16],rax
mov [rcx+rdx-8],rax
ret
align 4
SymCryptWipeAsmUnaligned:
;
; At this point we know that cbData(rdx) >= 16 and pbData(rcx) is unaligned.
; We can wipe 16 bytes and move to an aligned position
;
xor eax,eax
mov [rcx],rax
mov [rcx+8],rax
mov eax,ecx ;
neg eax ; lower 4 bits of eax = # bytes to wipe to reach alignment
and eax,15
add rcx,rax
sub rdx,rax
;
; If rdx > 16, go to the aligned wiping loop
;
cmp rdx,16
jae SymCryptWipeAsmAligned ; if cbData >= 16, do aligned wipes
;
; We have <= 16 bytes to wipe, and we know that the full wipe region was at least 16 bytes.
; We just wipe the last 16 bytes completely.
;
xor eax,eax
mov [rcx+rdx-16],rax
mov [rcx+rdx-8],rax
ret
align 8
SymCryptWipeAsmSmall:
; rcx = pbData, possibly unaligned
; rdx = cbData; rdx < 16
;
; With speculative execution attacks, the cost of a jump table is prohibitive.
; We use a compare ladder for 5 cases:
; 8-15 bytes
; 4-7 bytes
; 2-3 bytes
; 1 byte
; 0 bytes
xor eax,eax
cmp edx, 8
jb SymCryptWipeAsmSmallLessThan8
; wipe 8-15 bytes using two possibly overlapping writes
mov [rcx], rax
mov [rcx + rdx - 8], rax
ret
SymCryptWipeAsmSmallLessThan8:
cmp edx, 4
jb SymCryptWipeAsmSmallLessThan4
; wipe 4-7 bytes
mov [rcx], eax
mov [rcx + rdx - 4], eax
ret
SymCryptWipeAsmSmallLessThan4:
cmp edx, 2
jb SymCryptWipeAsmSmallLessThan2
; wipe 2-3 bytes
mov [rcx], ax
mov [rcx + rdx - 2], ax
ret
SymCryptWipeAsmSmallLessThan2:
or edx, edx
jz SymCryptWipeAsmSmallDone
; wipe 1 byte
mov [rcx], al
SymCryptWipeAsmSmallDone:
ret
LEAF_END SymCryptWipeAsm, _TEXT
END

lib/amd64/wipe.symcryptasm Normal file

@ -0,0 +1,165 @@
//
// wipe.symcryptasm Assembler code for wiping a buffer
// Expresses asm in a generic enough way to enable generation of MASM and GAS using the
// symcryptasm_processor.py script and C preprocessor
//
// Copyright (c) Microsoft Corporation. Licensed under the MIT license.
#include "symcryptasm_shared.cppasm"
//VOID
//SYMCRYPT_CALL
//SymCryptWipe( _Out_writes_bytes_( cbData ) PVOID pbData,
// SIZE_T cbData )
FUNCTION_START(SymCryptWipeAsm, 2, 4)
// Q1 = pbData
// Q2 = cbData
//
// This function will handle any alignment of pbData and any size, but it is optimized for
// the case where the start and end of the buffer are 16-aligned.
// 16 is the natural stack alignment on AMD64, and structures can be designed to be a multiple
// of 16 long without adding too much slack.
// The cost of non-alignment is relatively low, on the order of 5 cycles
//
xorps xmm0,xmm0 // Zero register for 16-byte wipes
cmp Q2,16
jb SymCryptWipeAsmSmall // if cbData < 16, this is a rare case
test Q1,15
jnz SymCryptWipeAsmUnaligned // if data pointer is unaligned, we jump to the code that aligns the pointer
// For well-optimized callers the aligned case is the common one, and that is
// the fall-through.
SymCryptWipeAsmAligned:
//
// Here Q1 is aligned, and Q2 contains the # bytes left to wipe, and Q2 >= 16
//
// Our loop wipes in 32-byte increments; we always wipe the first 16 bytes
// and increment the pbData pointer if cbData is 16 mod 32
// This avoids a conditional jump and is faster.
//
test Q2,16
movaps [Q1],xmm0 // it is safe to always wipe as cbData >= 16
lea Q3,[Q1+16]
cmovnz Q1,Q3 // only increment pbData if cbData = 16 mod 32
sub Q2,32 // see if we have >= 32 bytes to wipe
jc SymCryptWipeAsmTailOptional // if not, wipe tail, or nothing if cbData = 0 mod 16
ALIGN(16)
SymCryptWipeAsmLoop:
movaps [Q1],xmm0
movaps [Q1+16],xmm0 // Wipe 32 bytes
add Q1,32
sub Q2,32
jnc SymCryptWipeAsmLoop
SymCryptWipeAsmTailOptional:
// only the lower 4 bits of Q2 are valid, we have subtracted too much already.
// The wipe was at least 16 bytes, so we can just wipe the tail with 2 instructions
and D2,15
jnz SymCryptWipeAsmTail
ret
SymCryptWipeAsmTail:
// This code appears also below at the end of the unaligned wiping routine
// but making the jnz jump further is slower and we only duplicate 4 instructions.
xor D0,D0
mov [Q1+Q2-16],Q0
mov [Q1+Q2-8],Q0
ret
ALIGN(4)
SymCryptWipeAsmUnaligned:
//
// At this point we know that cbData(Q2) >= 16 and pbData(Q1) is unaligned.
// We can wipe 16 bytes and move to an aligned position
//
xor D0,D0
mov [Q1],Q0
mov [Q1+8],Q0
mov D0,D1
neg D0 // lower 4 bits of D0 = # bytes to wipe to reach alignment
and D0,15
add Q1,Q0
sub Q2,Q0
//
// If Q2 > 16, go to the aligned wiping loop
//
cmp Q2,16
jae SymCryptWipeAsmAligned // if cbData >= 16, do aligned wipes
//
// We have <= 16 bytes to wipe, and we know that the full wipe region was at least 16 bytes.
// We just wipe the last 16 bytes completely.
//
xor D0,D0
mov [Q1+Q2-16],Q0
mov [Q1+Q2-8],Q0
ret
ALIGN(8)
SymCryptWipeAsmSmall:
// Q1 = pbData, possibly unaligned
// Q2 = cbData; Q2 < 16
//
// With speculative execution attacks, the cost of a jump table is prohibitive.
// We use a compare ladder for 5 cases:
// 8-15 bytes
// 4-7 bytes
// 2-3 bytes
// 1 byte
// 0 bytes
xor D0,D0
cmp D2, 8
jb SymCryptWipeAsmSmallLessThan8
// wipe 8-15 bytes using two possibly overlapping writes
mov [Q1],Q0
mov [Q1+Q2-8],Q0
ret
SymCryptWipeAsmSmallLessThan8:
cmp D2, 4
jb SymCryptWipeAsmSmallLessThan4
// wipe 4-7 bytes
mov [Q1],D0
mov [Q1+Q2-4],D0
ret
SymCryptWipeAsmSmallLessThan4:
cmp D2, 2
jb SymCryptWipeAsmSmallLessThan2
// wipe 2-3 bytes
mov [Q1],W0
mov [Q1+Q2-2],W0
ret
SymCryptWipeAsmSmallLessThan2:
or D2,D2
jz SymCryptWipeAsmSmallDone
// wipe 1 byte
mov [Q1],B0
SymCryptWipeAsmSmallDone:
FUNCTION_END(SymCryptWipeAsm)
FILE_END()
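The small-size path above is worth a sketch: each size class below 16 bytes is covered by two possibly overlapping writes, avoiding a jump table. A hedged C version of just that size logic (hypothetical name; note a real wipe must also defeat dead-store elimination, which the asm does by construction):

#include <stddef.h>
#include <string.h>

void WipeSmallSketch( unsigned char *p, size_t cb )     // cb < 16
{
    static const unsigned char zero[8] = { 0 };

    // Each branch writes the first and last bytes of the range; the two
    // writes overlap for sizes that are not exact powers of two.
    if( cb >= 8 )      { memcpy( p, zero, 8 ); memcpy( p + cb - 8, zero, 8 ); }
    else if( cb >= 4 ) { memcpy( p, zero, 4 ); memcpy( p + cb - 4, zero, 4 ); }
    else if( cb >= 2 ) { memcpy( p, zero, 2 ); memcpy( p + cb - 2, zero, 2 ); }
    else if( cb == 1 ) { p[0] = 0; }
}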


@ -9,7 +9,11 @@
#include "symcrypt_version.inc"
#include "symcrypt_magic.inc"
; As the Arm assembler already uses the C preprocessor, we can just hardcode this asm to include the constants as
; MASM for now. To be fixed properly when converting arm64 asm to symcryptasm.
#define SYMCRYPT_MASM
#include "C_asm_shared.inc"
#undef SYMCRYPT_MASM
; A digit consists of 4 words of 32 bits each
@ -449,11 +453,11 @@ SymCryptFdefRawSquareAsmInnerLoopInit_Word1
SQR_SINGLEADD_32 3
add r2, r2, #16
add r4, r4, #16
adds r3, r3, #1 ; move one digit up
adds r3, r3, #1 ; move one digit up
bne SymCryptFdefRawSquareAsmInnerLoopInit_Word0
str r11, [r4] ; Store the next word into the destination
@ -689,7 +693,7 @@ SymCryptFdefMontgomeryReduceAsmInner
adds r11, r11, r7 ; c + pSrc[nWords] + hc
adc r8, r8, #0 ; Add the carry if any
str r11, [r1], #4 ; pSrc[nWords] = c
adds r12, r12, r6 ; c + pSrc[nWords+1]
adc r9, r9, #0 ; Add the carry if any
adds r12, r12, r8 ; c + pSrc[nWords] + hc
@ -701,7 +705,7 @@ SymCryptFdefMontgomeryReduceAsmInner
add r2, r2, #8 ; Move stored pSrc pointer two words up
ldr r0, [sp, #pMod] ; Restore the pMod pointer
mov r1, r2 ; Restore the pSrc pointer
bne SymCryptFdefMontgomeryReduceAsmOuter
;


@ -16,7 +16,11 @@
#include "symcrypt_name_mangling.inc"
#include "symcrypt_magic.inc"
; As the Arm assembler already uses the C preprocessor, we can just hardcode this asm to include the constants as
; MASM for now. To be fixed properly when converting arm64 asm to symcryptasm.
#define SYMCRYPT_MASM
#include "C_asm_shared.inc"
#undef SYMCRYPT_MASM
; A digit consists of 3 words of 64 bits each
@ -213,7 +217,7 @@ SymCryptFdef369RawMulAsmLoopInner1
adcs x12, x12, x15 ; Adding the previous word (if there was a carry from the last addition it is added)
umulh x15, x6, x8 ; Bits <127:64> of pSrc1[0]*pSrc2[j+2]
str x12, [x4], #8 ; Store to destination
cbnz x3, SymCryptFdef369RawMulAsmLoopInner1
adc x15, x15, XZR ; Store the next word into the destination (with the carry if any)


@ -10,7 +10,11 @@
#include "symcrypt_name_mangling.inc"
#include "symcrypt_magic.inc"
; As the Arm assembler already uses the C preprocessor, we can just hardcode this asm to include the constants as
; MASM for now. To be fixed properly when converting arm64 asm to symcryptasm.
#define SYMCRYPT_MASM
#include "C_asm_shared.inc"
#undef SYMCRYPT_MASM
; A digit consists of 4 words of 64 bits each


@ -517,11 +517,11 @@ SymCryptFdefIntSetValueUint64(
SYMCRYPT_ERROR
SYMCRYPT_CALL
SymCryptFdefRawSetValue(
_In_reads_bytes_(cbSrc) PCBYTE pbSrc,
SIZE_T cbSrc,
SYMCRYPT_NUMBER_FORMAT format,
_Out_writes_(nWords) PUINT32 pDst,
UINT32 nDigits )
_In_reads_bytes_(cbSrc) PCBYTE pbSrc,
SIZE_T cbSrc,
SYMCRYPT_NUMBER_FORMAT format,
_Out_writes_(nDigits * SYMCRYPT_FDEF_DIGIT_NUINT32) PUINT32 pDst,
UINT32 nDigits )
{
SYMCRYPT_ERROR scError;
UINT32 b;
@ -611,11 +611,11 @@ SymCryptFdefIntSetValue(
SYMCRYPT_ERROR
SYMCRYPT_CALL
SymCryptFdefRawGetValue(
_In_reads_(nWords) PCUINT32 pSrc,
UINT32 nDigits,
_Out_writes_bytes_(cbBytes) PBYTE pbDst,
SIZE_T cbDst,
SYMCRYPT_NUMBER_FORMAT format )
_In_reads_(nDigits * SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc,
UINT32 nDigits,
_Out_writes_bytes_(cbBytes) PBYTE pbDst,
SIZE_T cbDst,
SYMCRYPT_NUMBER_FORMAT format )
{
SYMCRYPT_ERROR scError;
UINT32 b;


@ -722,11 +722,11 @@ SymCryptFdefIntSquare(
VOID
SYMCRYPT_CALL
SymCryptFdefRawMulC(
_In_reads_(nWords1) PCUINT32 pSrc1,
UINT32 nDigits1,
_In_reads_(nWords2) PCUINT32 pSrc2,
UINT32 nDigits2,
_Out_writes_(nWords1 + nWords2) PUINT32 pDst )
_In_reads_(nDigits1 * SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc1,
UINT32 nDigits1,
_In_reads_(nDigits2 * SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc2,
UINT32 nDigits2,
_Out_writes_((nDigits1+nDigits2)*SYMCRYPT_FDEF_DIGIT_NUINT32) PUINT32 pDst )
{
UINT32 nWords1 = nDigits1 * SYMCRYPT_FDEF_DIGIT_NUINT32;
UINT32 nWords2 = nDigits2 * SYMCRYPT_FDEF_DIGIT_NUINT32;
@ -778,9 +778,9 @@ SymCryptFdefRawMul(
VOID
SYMCRYPT_CALL
SymCryptFdefRawSquareC(
_In_reads_(nWords) PCUINT32 pSrc,
UINT32 nDigits,
_Out_writes_(2*nWords) PUINT32 pDst )
_In_reads_(nDigits * SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc,
UINT32 nDigits,
_Out_writes_(2*nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PUINT32 pDst )
{
UINT32 nWords = nDigits * SYMCRYPT_FDEF_DIGIT_NUINT32;


@ -1223,7 +1223,7 @@ SymCryptFdefModMulMontgomery(
SymCryptFdefMontgomeryReduce( pmMod, pTmp, &peDst->d.uint32[0] );
}
#if SYMCRYPT_CPU_AMD64 && SYMCRYPT_MS_VC
#if SYMCRYPT_CPU_AMD64
VOID
SYMCRYPT_CALL
SymCryptFdefModMulMontgomeryMulx(
@ -1283,7 +1283,7 @@ SymCryptFdefModSquareMontgomery(
}
#if SYMCRYPT_CPU_AMD64 && SYMCRYPT_MS_VC
#if SYMCRYPT_CPU_AMD64
VOID
SYMCRYPT_CALL
SymCryptFdefModSquareMontgomeryMulx(
@ -1356,70 +1356,12 @@ SymCryptFdefModInvMontgomery(
return scError;
}
#if SYMCRYPT_CPU_AMD64 && SYMCRYPT_MS_VC
#if SYMCRYPT_CPU_AMD64
//=====================================
// 256-bit Montgomery modulus code
//
VOID
SYMCRYPT_CALL
SymCryptFdefModAdd256Test(
_In_ PCSYMCRYPT_MODULUS pmMod,
_In_ PCSYMCRYPT_MODELEMENT peSrc1,
_In_ PCSYMCRYPT_MODELEMENT peSrc2,
_Out_ PSYMCRYPT_MODELEMENT peDst,
_Out_writes_bytes_( cbScratch ) PBYTE pbScratch,
SIZE_T cbScratch )
{
SYMCRYPT_ASYM_ALIGN BYTE buf1[128];
SYMCRYPT_ASYM_ALIGN BYTE buf2[128];
PSYMCRYPT_MODELEMENT peTmp1 = SymCryptModElementCreate( SYMCRYPT_ASYM_ALIGN_UP( buf1 ), sizeof( buf1 ) - SYMCRYPT_ASYM_ALIGN_VALUE, pmMod );
PSYMCRYPT_MODELEMENT peTmp2 = SymCryptModElementCreate( SYMCRYPT_ASYM_ALIGN_UP( buf2 ), sizeof( buf2 ) - SYMCRYPT_ASYM_ALIGN_VALUE, pmMod );
(VOID) peTmp1;
(VOID) peTmp2;
SymCryptFdefModAdd256Asm( pmMod, peSrc1, peSrc2, peTmp1, pbScratch, cbScratch );
SymCryptFdefModAddGeneric( pmMod, peSrc1, peSrc2, peTmp2, pbScratch, cbScratch );
if( memcmp( peTmp1, peTmp2, 64 ) != 0 )
{
SymCryptFatal( 42 );
}
SymCryptFdefModAdd256Asm( pmMod, peSrc1, peSrc2, peDst, pbScratch, cbScratch );
}
VOID
SYMCRYPT_CALL
SymCryptFdefModMulMontgomery256Test(
_In_ PCSYMCRYPT_MODULUS pmMod,
_In_ PCSYMCRYPT_MODELEMENT peSrc1,
_In_ PCSYMCRYPT_MODELEMENT peSrc2,
_Out_ PSYMCRYPT_MODELEMENT peDst,
_Out_writes_bytes_( cbScratch ) PBYTE pbScratch,
SIZE_T cbScratch )
{
SYMCRYPT_ASYM_ALIGN BYTE buf1[128];
SYMCRYPT_ASYM_ALIGN BYTE buf2[128];
PSYMCRYPT_MODELEMENT peTmp1 = SymCryptModElementCreate( SYMCRYPT_ASYM_ALIGN_UP( buf1 ), sizeof( buf1 ) - SYMCRYPT_ASYM_ALIGN_VALUE, pmMod );
PSYMCRYPT_MODELEMENT peTmp2 = SymCryptModElementCreate( SYMCRYPT_ASYM_ALIGN_UP( buf2 ), sizeof( buf2 ) - SYMCRYPT_ASYM_ALIGN_VALUE, pmMod );
(VOID) peTmp1;
(VOID) peTmp2;
SymCryptFdefModMulMontgomery256Asm( pmMod, peSrc1, peSrc2, peTmp1, pbScratch, cbScratch );
//SymCryptFdefModMulMontgomery( pmMod, peSrc1, peSrc2, peTmp2, pbScratch, cbScratch ); *** This doesn't produce the same result as it reduces a whole digit, not 256 bits
if( memcmp( peTmp1, peTmp2, 64 ) != 0 )
{
// SymCryptFatal( 42 );
}
SymCryptFdefModMulMontgomery256Asm( pmMod, peSrc1, peSrc2, peDst, pbScratch, cbScratch );
}
VOID
SYMCRYPT_CALL
SymCryptFdefModSquareMontgomery256(


@ -1,5 +1,6 @@
;
; fdef_asm.asm Assembler code for fast arithmetic
; fdef_asm.cppasm Assembler code for fast arithmetic
; Requires C preprocessor to correctly include C_asm_shared.inc
;
; Copyright (c) Microsoft Corporation. Licensed under the MIT license.
;
@ -11,9 +12,9 @@
;
; FPO documentation:
; The .FPO provides debugging information.
; This stuff not well documented,
; This stuff not well documented,
; but here is the information I've gathered about the arguments to .FPO
;
;
; In order:
; cdwLocals: Size of local variables, in DWords
; cdwParams: Size of parameters, in DWords. Given that this is all about
@ -23,7 +24,7 @@
; prolog code with work for better performance. Most uses of
; .FPO seem to set this value to 0 anyway, which is what we
; will do.
; cbRegs : # registers saved in the prolog.
; cbRegs : # registers saved in the prolog.
; fUseBP : 0 if EBP is not used as base pointer, 1 if EBP is used as base pointer
; cbFrame : Type of frame.
; 0 = FPO frame (no frame pointer)
@ -43,7 +44,7 @@ _TEXT SEGMENT PARA PUBLIC USE32 'CODE'
include symcrypt_version.inc
include symcrypt_magic.inc
include C_asm_shared.inc
#include "C_asm_shared.inc"
PUBLIC @SymCryptFdefRawAddAsm@16
PUBLIC @SymCryptFdefRawSubAsm@16
@ -60,7 +61,7 @@ BEFORE_PROC MACRO
;
DB 5 dup (0cch)
ENDM
@ -86,7 +87,7 @@ pDst dd ?
nDigits dd ?
SymCryptFdefRawAddAsmFrame ends
; ecx = pSrc1
; edx = pSrc2
@ -129,7 +130,7 @@ SymCryptFdefRawAddAsmLoop:
pop edi
pop ebx
ret 8
@SymCryptFdefRawAddAsm@16 ENDP
@ -154,7 +155,7 @@ pDst dd ?
nDigits dd ?
SymCryptFdefRawSubAsmFrame ends
; ecx = pSrc1
; edx = pSrc2
@ -197,7 +198,7 @@ SymCryptFdefRawSubAsmLoop:
pop edi
pop ebx
ret 8
@SymCryptFdefRawSubAsm@16 ENDP
@ -305,8 +306,8 @@ SymCryptFdefRawMulAsmFrame ends
; for each word in Src1:
; Dst += Src2 * word
; Register assignments
;
; eax = tmp/lower half of mult
;
; eax = tmp/lower half of mult
; ebx = multiplicant
; ecx = loop counter, initialized to nDigits2
; edx = upper half of mult
@ -315,7 +316,7 @@ SymCryptFdefRawMulAsmFrame ends
; ebp = carry
;
; esp + pSrc1 running pointer into Src1
; esp +
; esp +
mov edi,edi
@ -436,7 +437,7 @@ SymCryptFdefRawMulAsmLoop2:
adc edx, 0
mov [edi + 12], eax
mov ebp, edx
add esi, 16
add edi, 16
sub ecx,1
@ -477,7 +478,7 @@ SymCryptFdefMontgomeryReduceAsmFrame struct 4, NONUNIQUE
HighCarry dd ?
pSrc dd ?
pModValue dd ?
nWords dd ?
nWords dd ?
SaveEbp dd ? ; # words still to process in outer loop
SaveEsi dd ?
SaveEdi dd ?
@ -513,13 +514,13 @@ SymCryptFdefMontgomeryReduceAsmFrame ends
SymCryptFdefMontgomeryReduceOuterLoop:
; eax = <undef>
; ebx = <undef>
; ecx = <undef>
; ecx = <undef>
; edx = <undef>
; esi = start of mod value
; edi = pSrc + 4 * loop iteration count
; ebp = <undef>
; compute multiplier for this outer loop iteration.
; compute multiplier for this outer loop iteration.
mov ebx, [esi - SymCryptModulusValueOffsetX86 + SymCryptModulusMontgomeryInv64OffsetX86 ]
imul ebx, [edi] ; word we want to zero out, ebx = multiplier for this inner loop
@ -529,7 +530,7 @@ SymCryptFdefMontgomeryReduceOuterLoop:
SymCryptFdefMontgomeryReduceInnerLoop:
; eax = mul scratch
; ebx = multiplier
; ecx = digit counter
; ecx = digit counter
; edx = mul scratch
; esi = running pointer to mod value
; edi = running pointer to input/scratch
@ -570,7 +571,7 @@ SymCryptFdefMontgomeryReduceInnerLoop:
adc edx, 0
mov [edi + 12], eax
mov ebp, edx
add esi, 16
add edi, 16
sub ecx,1
@ -606,7 +607,7 @@ SymCryptFdefMontgomeryReduceInnerLoop:
mov ecx, [esi - SymCryptModulusValueOffsetX86 + SymCryptModulusNdigitsOffsetX86] ; loop counter
mov edx, [esp + SymCryptFdefMontgomeryReduceAsmFrame.pDst];
; ecx = nDigits
; Save some values for the copy loop


@ -1,314 +0,0 @@
;
; rc4asm.asm
;
; Copyright (c) Microsoft Corporation. Licensed under the MIT license.
;
; RC4 implementation in x86 assembler
; This is a new RC4 implementation for SymCrypt.
; It is NOT based on the existing one in RSA32.lib.
;
TITLE "RC4"
.586P
_TEXT SEGMENT PARA PUBLIC USE32 'CODE'
ASSUME CS:_TEXT, DS:FLAT, SS:FLAT
include symcrypt_version.inc
include symcrypt_magic.inc
;
; Structure definition that mirrors the SYMCRYPT_RC4_STATE struct
;
RC4_STATE struct
S db 256 dup (?)
i db ?
j db ?
SYMCRYPT_MAGIC_FIELD
RC4_STATE ends
PUBLIC @SymCryptRc4InitAsm@12
PUBLIC @SymCryptRc4CryptAsm@16
BEFORE_PROC MACRO
;
; Our current x86 compiler inserts 5 0xcc bytes before every function
; and starts every function with a 2-byte NOP.
; This supports hot-patching.
;
DB 5 dup (0cch)
ENDM
; The .FPO provides debugging information.
; This stuff not well documented,
; but here is the information I've gathered about the arguments to .FPO
;
; In order:
; cdwLocals: Size of local variables, in DWords
; cdwParams: Size of parameters, in DWords. Given that this is all about
; stack stuff, I'm assuming this is only about parameters passed
; on the stack.
; cbProlog : Number of bytes in the prolog code. We have interleaved the
; prolog code with work for better performance. Most uses of
; .FPO seem to set this value to 0 anyway, which is what we
; will do.
; cbRegs : # registers saved in the prolog. 4 in our case
; fUseBP : 0 if EBP is not used as base pointer, 1 if EBP is used as base pointer
; cbFrame : Type of frame.
; 0 = FPO frame (no frame pointer)
; 1 = Trap frame (result of a CPU trap event)
; 2 = TSS frame
;
; Having looked at various occurrences of .FPO in the Windows code it
; seems to be used fairly sloppy, with lots of arguments left 0 even when
; they probably shouldn't be according to the spec.
;
BEFORE_PROC
@SymCryptRc4InitAsm@12 PROC
;VOID
;SYMCRYPT_CALL
;SymCryptRc4InitAsm(
; _Out_ PSYMCRYPT_RC4_STATE pState,
; _In_reads_bytes_( cbKey ) PCBYTE pbKey,
; _In_ SIZE_T cbKey );
;
; NOTE: Unlike the SymCryptRc4Init function
; this function does not check the cbKey validity, and does not return an error code.
; Currently we don't have the error code values symbolically in the asm environment.
; We use an inlined function to generate the errors instead, and call this function
; only when there are no errors.
;
Rc4InitFrame struct 4, NONUNIQUE
pbKey dd ?
SaveEdi dd ?
SaveEsi dd ?
SaveEbp dd ?
SaveEbx dd ?
ReturnAddress dd ?
cbKey dd ?
Rc4InitFrame ends
.FPO(5,1,0,4,0,0)
; ecx = pState
; edx = pKey
; [esp + 4] = cbKey
;
; Set up stack frame, and initialize pbKey
;
mov edi,edi ; 2-byte NOP for hot-patching
push ebx
push ebp
push esi
push edi
push edx
;
; Initialize S[i] = i
;
lea esi,[ecx + 100h]
mov edi,ecx
mov eax,03020100h
mov ebx,04040404h
@@:
mov [edi],eax
add eax,ebx
mov [edi+4],eax
add eax,ebx
mov [edi+8],eax
add eax,ebx
mov [edi+12],eax
add eax,ebx
add edi,16
cmp edi,esi
jb @B
mov ebp,edx
xor ebx,ebx ; j = 0
xor esi,esi ; i = 0
mov edi,[esp + Rc4InitFrame.cbKey]
add edi, edx ; edi = pbKey + cbKey
SymCryptRc4InitMainLoop:
; Registers:
; eax = Tmp1
; ebx = j
; ecx = S
; edx = Tmp2
; esi = i
; edi = keyLimit ; just beyond the key
; ebp = pKey ; pointer to current key byte
movzx edx,byte ptr[ebp] ; get key byte
add ebx,edx ; j += key byte
movzx eax,byte ptr[ecx + esi] ; get S[i]
add ebx,eax ; j += S[i]
and ebx,0ffh
movzx edx,byte ptr [ecx + ebx]; get S[j]
mov byte ptr[ecx + ebx], al ; update S[j]
mov byte ptr[ecx + esi], dl ; update S[i]
add ebp,1 ; increment key pointer modulo key length
cmp ebp,edi
jb @F
mov ebp,[esp + Rc4InitFrame.pbKey]
@@:
add esi,1 ; increment i
cmp esi,100h
jb SymCryptRc4InitMainLoop
mov word ptr [ecx + RC4_STATE.i], 1 ; i = 1; j = 0
add esp,4
pop edi
pop esi
pop ebp
pop ebx
ret 4
@SymCryptRc4InitAsm@12 ENDP
BEFORE_PROC
@SymCryptRc4CryptAsm@16 PROC
;VOID
;SYMCRYPT_CALL
;SymCryptRc4Crypt(
; _Inout_ PSYMCRYPT_RC4_STATE pState,
; _In_reads_bytes_( cbData ) PCBYTE pbSrc,
; _Out_writes_bytes_( cbData ) PBYTE pbDst,
; _In_ SIZE_T cbData )
Rc4CryptFrame struct 4, NONUNIQUE
pbEndDst dd ?
SaveEdi dd ?
SaveEsi dd ?
SaveEbp dd ?
SaveEbx dd ?
ReturnAddress dd ?
pbDst dd ?
cbData dd ?
Rc4CryptFrame ends
.FPO(5,2,0,4,0,0)
mov edi,edi
push ebx
push ebp
push esi
push edi
sub esp,4
SYMCRYPT_CHECK_MAGIC ecx, RC4_STATE
mov eax,[esp + Rc4CryptFrame.cbData]
test eax,eax
jz Rc4CryptDoNothing
mov ebp,[esp + Rc4CryptFrame.pbDst]
add eax,ebp
mov [esp + Rc4CryptFrame.pbEndDst], eax
mov edi, edx
movzx edx,[ecx + RC4_STATE.i]
movzx esi,[ecx + RC4_STATE.j]
;
; Further perf improvements are possible.
; Instead of encrypting byte-by-byte, we can collect 4 bytes of the key
; stream in a register, and then encrypt 4 bytes at a time.
; This reduces the # memory operations we do per byte.
; Ideally this is done with aligned operations, either
; aligning to pbSrc, pbDst, or to i (which removes the need to increment i every time).
;
@@:
; eax Ti
; ebx Tj
; ecx S
; edx i
; esi j
; edi pSrc
; ebp pDst
movzx eax, byte ptr[ecx + edx] ; Ti = S[i]
;add esi, eax
;and esi, 0ffh
lea ebx, [esi + eax]
movzx esi, bl ; j += Ti
movzx ebx, byte ptr[ecx + esi] ; Tj = S[j]
mov [ecx + edx], bl ; S[i] = Tj
mov [ecx + esi], al ; S[j] = Ti
;add eax,ebx
;and eax,0ffh
lea eax,[eax + ebx]
movzx eax,al ; Ti = Ti + Tj
mov al,[ecx + eax] ; al = S[(Ti + Tj) MOD 256], the key stream byte
;add edx, 1
;and 0ffh
lea edx,[edx + 1]
movzx edx,dl ; i += 1
xor al,[edi]
add edi,1
mov [ebp],al
add ebp, 1
cmp ebp,[esp + Rc4CryptFrame.pbEndDst]
jb @B
mov eax, esi
mov [ecx + RC4_STATE.i], dl
mov [ecx + RC4_STATE.j], al
Rc4CryptDoNothing:
add esp,4
pop edi
pop esi
pop ebp
pop ebx
ret 8
@SymCryptRc4CryptAsm@16 ENDP
_TEXT ENDS
END
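This file is removed outright; for future readers, here is a brief C sketch of the crypt loop it implemented (names illustrative; same per-byte recurrence as the register comments above, with i starting at 1 after init):

#include <stddef.h>
#include <stdint.h>

typedef struct { uint8_t S[256]; uint8_t i; uint8_t j; } Rc4SketchState;

void Rc4CryptSketch( Rc4SketchState *pState, const uint8_t *pbSrc,
                     uint8_t *pbDst, size_t cbData )
{
    uint8_t i = pState->i;
    uint8_t j = pState->j;
    for( size_t k = 0; k < cbData; k++ )
    {
        uint8_t Ti = pState->S[i];
        j = (uint8_t)(j + Ti);
        uint8_t Tj = pState->S[j];
        pState->S[i] = Tj;                          // swap S[i] and S[j]
        pState->S[j] = Ti;
        pbDst[k] = pbSrc[k] ^ pState->S[(uint8_t)(Ti + Tj)];
        i = (uint8_t)(i + 1);
    }
    pState->i = i;
    pState->j = j;
}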


@ -1,383 +0,0 @@
;
; Sha1Asm.Asm
;
; Copyright (c) Microsoft Corporation. Licensed under the MIT license.
;
;
;
; This module implements the bulk processing of the FIPS 180-1 SHA message digest algorithm
; for the x86 processor architecture.
;
; This implementation is derived from an older one by Scott Field and
; Dan Shumow.
;
; This implementation is optimized for Intel Core and contemporary AMD CPUs.
; Optimizations for pre-P3 Intel CPUs have been removed.
;
TITLE sha1asm.asm
.486
_TEXT SEGMENT PARA PUBLIC USE32 'CODE'
ASSUME CS:_TEXT, DS:FLAT, SS:FLAT
PUBLIC @SymCryptSha1AppendBlocksAsm@12
;
; The four round constants used by SHA-1
;
K0_19 EQU 05a827999H
K20_39 EQU 06ed9eba1H
K40_59 EQU 08f1bbcdcH
K60_79 EQU 0ca62c1d6H
align 16
;VOID
;SYMCRYPT_CALL
;SymCryptSha1AppendBlocks( _Inout_updates_( 5 ) PUINT32 H,
; _In_reads_bytes_( cbData ) PCBYTE pbData,
; SIZE_T cbData )
;
@SymCryptSha1AppendBlocksAsm@12 PROC
;
; To keep stack manipulations simple we define a structure and use that for all accesses.
;
SymCryptSha1AppendBlocksFrame struct 4, NONUNIQUE
Wbuf dd 16 dup (?)
Hptr dd ?
pbData dd ?
BlockCount dd ?
SaveEdi dd ?
SaveEsi dd ?
SaveEbp dd ?
SaveEbx dd ?
ReturnAddress dd ?
CbData dd ?
SymCryptSha1AppendBlocksFrame ends
;
; We use the W buffer extensively; this is a shorthand for the base address
;
W equ esp+SymCryptSha1AppendBlocksFrame.Wbuf
;
; The .FPO provides debugging information for stack frames that do not use
; ebp as a base pointer.
; This stuff not well documented,
; but here is the information I've gathered about the arguments to .FPO
;
; In order:
; cdwLocals: Size of local variables, in DWords
; cdwParams: Size of parameters, in DWords. Given that this is all about
; stack stuff, I'm assuming this is only about parameters passed
; on the stack.
; cbProlog : Number of bytes in the prolog code. We sometimes interleaved the
; prolog code with work for better performance. Most uses of
; .FPO seem to set this value to 0.
; The debugger seems to work if the prolog defined by this value
; contains all the stack adjustments.
; cbRegs : # registers saved in the prolog. 4 in our case
; fUseBP : 0 if EBP is not used as base pointer, 1 if EBP is used as base pointer
; cbFrame : Type of frame.
; 0 = FPO frame (no frame pointer)
; 1 = Trap frame (result of a CPU trap event)
; 2 = TSS frame
;
; Having looked at various occurrences of .FPO in the Windows code it
; seems to be used fairly sloppy, with lots of arguments left 0 even when
; they probably shouldn't be according to the spec.
;
.FPO(23,1,3,4,0,0) ; 3 byte prolog (covers esp adjustment only)
; At this point:
; ecx = H
; edx = pbData
; [esp+4] = cbData
;
; Set up our stack frame and save non-volatile registers
;
sub esp,SymCryptSha1AppendBlocksFrame.ReturnAddress
mov [esp+SymCryptSha1AppendBlocksFrame.SaveEbp],ebp
mov [esp+SymCryptSha1AppendBlocksFrame.SaveEdi],edi
mov [esp+SymCryptSha1AppendBlocksFrame.SaveEsi],esi
mov [esp+SymCryptSha1AppendBlocksFrame.SaveEbx],ebx
mov [esp+SymCryptSha1AppendBlocksFrame.Hptr], ecx
;
; To allow macro re-ordering of our registers we use symbolic names
; for the registers.
; r0-r4 are the 5 state registers. x1 and x2 are extra scratch registers.
; Note: some prolog code puts the right value in the right register and
; has to be updated if this mapping is changed.
;
r0 EQU eax
r1 EQU ebx
r2 EQU ecx
r3 EQU edx
r4 EQU esi
x1 EQU ebp
x2 EQU edi
;
; compute how many blocks we will process.
; This code ensures that we never exceed the data buffer we were given,
; although we silently round the cbData parameter down to the next
; multiple of 64.
; Do nothing if no blocks need to be processed.
;
mov eax,[esp+SymCryptSha1AppendBlocksFrame.CbData]
shr eax,6
jz SymCryptSha1AppendBlocksDone
mov [esp+SymCryptSha1AppendBlocksFrame.BlockCount], eax
;
; The data pointer goes into x1 = ebp at the start of our loop
;
mov ebp,edx
;
; Load the H state from [ecx], making sure we load the r2=ecx register
; last.
;
mov r0,[ecx ]
mov r4,[ecx+ 4]
mov r3,[ecx+ 8]
mov r1,[ecx+16]
mov r2,[ecx+12]
SymCryptSha1AppendBlocksLoop:
;
; This is the main loop. We process 64 bytes in each iteration.
; invariant: ebp = pbData
;
;
; Most of the code in the loop is generated through macros using parameters to
; rename the registers.
; The macros get the register number passed as parameter. They use
; "r&<param>" to paste the number and the 'r' together and get the register
; name we defined above.
;
ROUND_CH_0_15 MACRO round,ra,rb,rc,rd,re,x1,x2
;
; Code for round 0-15.
; This code loads data from the data buffer & BSWAPs the data to get it into the
; right form.
;
; Parameters:
; round round number
; ra register number that contains the a value
; rb register number that contains the b value
; rc register number that contains the c value
; rd register number that contains the d value
; re register number that contains the e value
; x1 pointer to the input data
; x2 scratch register.
;
; We use the formula CH(b,c,d) = ((d ^ c) & b) ^ d which uses only one temp register.
; We start with the d value as that is the oldest value and available the first
;
; See FIPS 180-2 for our symbolic notation.
;
mov x2,[x1+4*round] ; Fetch word from message
bswap x2 ; x2 = Wt
add r&re,x2 ; re = e + Wt
mov [W + 4*round],x2 ; Store in W buffer for future use
mov x2,r&ra ; x2 = a
rol x2,5 ; x2 = ROL(a,5)
add r&re,x2 ; re = e + Wt + ROL(a,5)
mov x2,r&rd ; x2 = d
xor x2,r&rc ; x2 = (d ^ c)
and x2,r&rb ; x2 = ((d ^ c) & b)
ror r&rb,2 ; rb = ROL( b, 30 )
xor x2,r&rd ; x2 = ((d ^ c) & b) ^ d = CH(b,c,d)
lea r&re,[r&re+x2+K0_19] ; re = e + Wt + ROL(a,5) + Ch(b,c,d) + Kt
ENDM
ROUND_CH MACRO round, ra, rb, rc, rd, re, x1, x2
;
; See ROUND_CH_0_15 for most parameters.
; x1 and x2 are both scratch registers
;
mov x2,[W+4*((round-16) MOD 16)] ; x2 = W[t-16]
mov x1,r&ra ; x1 = a
rol x1,5 ; x1 = ROL(a,5)
xor x2,[W+4*((round-14) MOD 16)] ; x2 = W[t-16] ^ W[t-14]
add r&re,x1 ; re = e + ROL(a,5)
mov x1,r&rd ; x1 = d
xor x2,[W+4*((round- 8) MOD 16)] ; x2 = W[t-16] ^ W[t-14] ^ W[t-8]
xor x1,r&rc ; x1 = (d ^ c)
and x1,r&rb ; x1 = ((d ^ c) & b)
xor x2,[W+4*((round- 3) MOD 16)] ; x2 = W[t-16] ^ W[t-14] ^ W[t-8] ^ W[t-3]
xor x1,r&rd ; x1 = ((d ^ c) & b) ^ d = CH(b,c,d)
rol x2,1 ; x2 = Wt
mov [W+4*((round-16) MOD 16)],x2 ;
add r&re,x2 ; re = e + ROL(a,5) + Wt
ror r&rb,2 ; rb = ROL( b, 30 )
lea r&re,[r&re+x1+K0_19] ; re = e + Wt + ROL(a,5) + Ch(b,c,d) + Kt
ENDM
ROUND_PARITY MACRO round, ra, rb, rc, rd, re, x1, x2, K, store
;
; See ROUND_CH for most parameters
; K is the round constant to use.
; store is 1 if the Wt value should be stored, 0 otherwise
; (used to avoid stores in the last few rounds)
;
; The order of xorring the registers b, c, and d is driven by the data dependency graph.
; We start with d (the oldest) and then do b to unblock the subsequent rotate
;
mov x2,[W+4*((round-16) MOD 16)] ; x2 = W[t-16]
mov x1,r&ra ; x1 = a
rol x1,5 ; x1 = ROL(a,5)
xor x2,[W+4*((round-14) MOD 16)] ; x2 = W[t-16] ^ W[t-14]
add r&re,x1 ; re = e + ROL(a,5)
mov x1,r&rd ; x1 = d
xor x2,[W+4*((round- 8) MOD 16)] ; x2 = W[t-16] ^ W[t-14] ^ W[t-8]
xor x1,r&rb ; x1 = (d ^ b)
xor x1,r&rc ; x1 = (d ^ b ^ c) = Parity(b,c,d)
xor x2,[W+4*((round- 3) MOD 16)] ; x2 = W[t-16] ^ W[t-14] ^ W[t-8] ^ W[t-3]
rol x2,1 ; x2 = Wt
add r&re,x1 ; re = e + ROL(a,5) + Parity(b,c,d)
IF store
mov [W+4*((round-16) MOD 16)],x2 ;
ENDIF
ror r&rb,2 ; rb = ROL( b, 30 )
lea r&re,[r&re+x2+K] ; re = e + ROL(a,5) + Parity(b,c,d) + Wt + Kt
ENDM
ROUND_MAJ MACRO round, ra, rb, rc, rd, re, x1, x2
;
; See above for parameter explanation
;
mov x2,[W+4*((round-16) MOD 16)] ; x2 = W[t-16]
mov x1,r&ra ; x1 = a
rol x1,5 ; x1 = ROL(a,5)
xor x2,[W+4*((round-14) MOD 16)] ; x2 = W[t-16] ^ W[t-14]
add r&re,x1 ; re = e + ROL(a,5)
mov x1,r&rd ; x1 = d
xor x2,[W+4*((round- 8) MOD 16)] ; x2 = W[t-16] ^ W[t-14] ^ W[t-8]
or x1,r&rc ; x1 = (d | c)
and x1,r&rb ; x1 = ((d | c) & b)
xor x2,[W+4*((round- 3) MOD 16)] ; x2 = W[t-16] ^ W[t-14] ^ W[t-8] ^ W[t-3] = Wt
rol x2,1 ; x2 = Wt
add r&re,x2 ; re = e + ROL(a,5) + Wt
mov [W+4*((round-16) MOD 16)],x2 ;
mov x2,r&rc ; x2 = c
and x2,r&rd ; x2 = (c & d)
or x1,x2 ; x1 = ((d | c) & b) | (d & c) = MAJ(b,c,d)
ror r&rb,2 ; rb = ROL( b, 30 )
lea r&re,[r&re+x1+K40_59] ; re = e + ROL(a,5) + Wt + Maj(b,c,d) + Kt
ENDM
;
; With these macros we can now produce the actual code.
; Note the use of the % operator which evaluates the expression and yields the result as text.
; Together with the macros and the r<i> EQUs this provides us with automatic register renaming
; for each round.
;
FOR t, <0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15>
ROUND_CH_0_15 t, %(t MOD 5), %((t + 4) MOD 5), %((t + 3) MOD 5), %((t + 2) MOD 5), %((t + 1) MOD 5), x1, x2
ENDM
;
; For the rest of the computation we need the extra register, so we update the data pointer and store it.
;
add ebp,64
mov [esp+SymCryptSha1AppendBlocksFrame.pbData], ebp
FOR t, <16, 17, 18, 19>
ROUND_CH t, %(t MOD 5), %((t + 4) MOD 5), %((t + 3) MOD 5), %((t + 2) MOD 5), %((t + 1) MOD 5), x1, x2
ENDM
FOR t, <20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39>
ROUND_PARITY t, %(t MOD 5), %((t + 4) MOD 5), %((t + 3) MOD 5), %((t + 2) MOD 5), %((t + 1) MOD 5), x1, x2, K20_39, 1
ENDM
FOR t, <40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59>
ROUND_MAJ t, %(t MOD 5), %((t + 4) MOD 5), %((t + 3) MOD 5), %((t + 2) MOD 5), %((t + 1) MOD 5), x1, x2
ENDM
FOR t, <60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76>
ROUND_PARITY t, %(t MOD 5), %((t + 4) MOD 5), %((t + 3) MOD 5), %((t + 2) MOD 5), %((t + 1) MOD 5), x1, x2, K60_79, 1
ENDM
;
; The last three rounds do not need to store their Wt in the W buffer as that value will never get used.
;
FOR t, <77, 78, 79>
ROUND_PARITY t, %(t MOD 5), %((t + 4) MOD 5), %((t + 3) MOD 5), %((t + 2) MOD 5), %((t + 1) MOD 5), x1, x2, K60_79, 0
ENDM
;
; Now we update the state
;
mov x2,[esp+SymCryptSha1AppendBlocksFrame.Hptr]
add r0,[x2 ]
add r4,[x2+ 4]
add r3,[x2+ 8]
add r2,[x2+12]
add r1,[x2+16]
mov [x2 ], r0
mov [x2+ 4], r4
mov [x2+ 8], r3
mov [x2+12], r2
mov [x2+16], r1
;
; See if we have more data to process, and load the data pointer register again
;
dec [esp+SymCryptSha1AppendBlocksFrame.BlockCount]
mov ebp, [esp+SymCryptSha1AppendBlocksFrame.pbData]
jnz SymCryptSha1AppendBlocksLoop
;
; We're done processing the blocks. The result is already in the state, so all we have to do
; is clean up.
;
; Wipe the W buffer
; The @@: label is an anonymous label. You can refer to the previous one using @B, which is easy to read.
;
mov ecx,8
xor eax,eax
@@: dec ecx
mov [esp+8*ecx],eax
mov [esp+8*ecx+4],eax
jnz @B
SymCryptSha1AppendBlocksDone:
;
; Restore non-volatile registers & stack pointer
;
mov ebp,[esp+SymCryptSha1AppendBlocksFrame.SaveEbp]
mov edi,[esp+SymCryptSha1AppendBlocksFrame.SaveEdi]
mov esi,[esp+SymCryptSha1AppendBlocksFrame.SaveEsi]
mov ebx,[esp+SymCryptSha1AppendBlocksFrame.SaveEbx]
add esp,SymCryptSha1AppendBlocksFrame.ReturnAddress
ret 4
@SymCryptSha1AppendBlocksAsm@12 ENDP
_TEXT ENDS
END


@ -7,9 +7,7 @@
#include "precomp.h"
#define EQU =
#include "C_asm_shared.inc"
#undef EQU
#include "buildInfo.h"
@ -34,16 +32,16 @@ SymCryptLibraryWasNotInitialized()
#endif
const CHAR * SymCryptBuildString =
"v" SYMCRYPT_BUILD_INFO_VERSION
"_" SYMCRYPT_BUILD_INFO_BRANCH
"_" SYMCRYPT_BUILD_INFO_COMMIT
"_" SYMCRYPT_BUILD_INFO_TIMESTAMP;
VOID
SYMCRYPT_CALL
SymCryptInitEnvCommon( UINT32 version )
// Returns TRUE if the initializatoin steps have to be performed.
// Returns TRUE if the initialization steps have to be performed.
{
UINT32 tmp;


@ -1,223 +0,0 @@
//
// asmstubs.c
// Temporary forwarders for ASM implementations which we don't yet support with GCC/LLVM
//
// Copyright (c) Microsoft Corporation. Licensed under the MIT license.
//
#include "../precomp.h"
extern const SYMCRYPT_BLOCKCIPHER SymCryptAesBlockCipherNoOpt;
VOID
SYMCRYPT_CALL
SymCryptAesEncryptAsm(
_In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
_In_reads_(SYMCRYPT_AES_BLOCK_SIZE) PCBYTE pbSrc,
_Out_writes_(SYMCRYPT_AES_BLOCK_SIZE) PBYTE pbDst )
{
SymCryptAesEncryptC( pExpandedKey, pbSrc, pbDst );
}
VOID
SYMCRYPT_CALL
SymCryptAesDecryptAsm(
_In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
_In_reads_(SYMCRYPT_AES_BLOCK_SIZE) PCBYTE pbSrc,
_Out_writes_(SYMCRYPT_AES_BLOCK_SIZE) PBYTE pbDst )
{
SymCryptAesDecryptC( pExpandedKey, pbSrc, pbDst );
}
VOID
SYMCRYPT_CALL
SymCryptAesCbcEncryptAsm(
_In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
_Inout_updates_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbChainingValue,
_In_reads_( cbData ) PCBYTE pbSrc,
_Out_writes_( cbData ) PBYTE pbDst,
SIZE_T cbData )
{
SymCryptCbcEncrypt( &SymCryptAesBlockCipherNoOpt, pExpandedKey, pbChainingValue, pbSrc, pbDst, cbData );
}
VOID
SYMCRYPT_CALL
SymCryptAesCbcDecryptAsm(
_In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
_Inout_updates_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbChainingValue,
_In_reads_( cbData ) PCBYTE pbSrc,
_Out_writes_( cbData ) PBYTE pbDst,
SIZE_T cbData )
{
SymCryptCbcDecrypt( &SymCryptAesBlockCipherNoOpt, pExpandedKey, pbChainingValue, pbSrc, pbDst, cbData );
}
VOID
SYMCRYPT_CALL
SymCryptAesCtrMsb64Asm(
_In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
_Inout_updates_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbChainingValue,
_In_reads_( cbData ) PCBYTE pbSrc,
_Out_writes_( cbData ) PBYTE pbDst,
SIZE_T cbData )
{
SYMCRYPT_ASSERT( SymCryptAesBlockCipherNoOpt.blockSize == SYMCRYPT_AES_BLOCK_SIZE ); // keep Prefast happy
SymCryptCtrMsb64( &SymCryptAesBlockCipherNoOpt, pExpandedKey, pbChainingValue, pbSrc, pbDst, cbData );
}
VOID
SYMCRYPT_CALL
SymCryptWipeAsm( _Out_writes_bytes_( cbData ) PVOID pbData, SIZE_T cbData )
{
volatile BYTE * p = (volatile BYTE *) pbData;
SIZE_T i;
for( i=0; i<cbData; i++ ){
p[i] = 0;
}
}
VOID
SYMCRYPT_CALL
SymCryptFdefMaskedCopyC(
_In_reads_bytes_( nDigits*SYMCRYPT_FDEF_DIGIT_SIZE ) PCBYTE pbSrc,
_Inout_updates_bytes_( nDigits*SYMCRYPT_FDEF_DIGIT_SIZE ) PBYTE pbDst,
UINT32 nDigits,
UINT32 mask );
VOID
SYMCRYPT_CALL
SymCryptFdefMaskedCopyAsm(
_In_reads_bytes_( nDigits*SYMCRYPT_FDEF_DIGIT_SIZE ) PCBYTE pbSrc,
_Inout_updates_bytes_( nDigits*SYMCRYPT_FDEF_DIGIT_SIZE ) PBYTE pbDst,
UINT32 nDigits,
UINT32 mask )
{
SymCryptFdefMaskedCopyC( pbSrc, pbDst, nDigits, mask );
}
UINT32
SYMCRYPT_CALL
SymCryptFdefRawAddC(
_In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc1,
_In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc2,
_Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 pDst,
UINT32 nDigits );
UINT32
SYMCRYPT_CALL
SymCryptFdefRawAddAsm(
_In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc1,
_In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc2,
_Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 pDst,
UINT32 nDigits )
{
return SymCryptFdefRawAddC( pSrc1, pSrc2, pDst, nDigits );
}
UINT32
SYMCRYPT_CALL
SymCryptFdefRawSubC(
_In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc1,
_In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc2,
_Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 pDst,
UINT32 nDigits );
UINT32
SYMCRYPT_CALL
SymCryptFdefRawSubAsm(
_In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc1,
_In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc2,
_Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 pDst,
UINT32 nDigits )
{
return SymCryptFdefRawSubC( pSrc1, pSrc2, pDst, nDigits );
}
VOID
SYMCRYPT_CALL
SymCryptFdefRawMulC(
_In_reads_(nWords1) PCUINT32 pSrc1,
UINT32 nDigits1,
_In_reads_(nWords2) PCUINT32 pSrc2,
UINT32 nDigits2,
_Out_writes_(nWords1 + nWords2) PUINT32 pDst );
VOID
SYMCRYPT_CALL
SymCryptFdefRawMulMulx(
_In_reads_(nDgigits1*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc1,
UINT32 nDigits1,
_In_reads_(nDigits2*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc2,
UINT32 nDigits2,
_Out_writes_(nWords1 + nWords2) PUINT32 pDst )
{
SymCryptFdefRawMulC( pSrc1, nDigits1, pSrc2, nDigits2, pDst );
}
VOID
SYMCRYPT_CALL
SymCryptFdefRawMulAsm(
_In_reads_(nDigits1*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc1,
UINT32 nDigits1,
_In_reads_(nDigits2*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc2,
UINT32 nDigits2,
_Out_writes_(nWords1 + nWords2) PUINT32 pDst )
{
SymCryptFdefRawMulC( pSrc1, nDigits1, pSrc2, nDigits2, pDst );
}
VOID
SYMCRYPT_CALL
SymCryptFdefRawSquareC(
_In_reads_(nWords) PCUINT32 pSrc,
UINT32 nDigits,
_Out_writes_(2*nWords) PUINT32 pDst );
VOID
SYMCRYPT_CALL
SymCryptFdefRawSquareMulx(
_In_reads_(nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc,
UINT32 nDigits,
_Out_writes_(2*nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PUINT32 pDst )
{
SymCryptFdefRawSquareC( pSrc, nDigits, pDst );
}
VOID
SYMCRYPT_CALL
SymCryptFdefRawSquareAsm(
_In_reads_(nDgigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc,
UINT32 nDigits,
_Out_writes_(2*nWords) PUINT32 pDst )
{
SymCryptFdefRawSquareC( pSrc, nDigits, pDst );
}
VOID
SymCryptFdefMontgomeryReduceC(
_In_ PCSYMCRYPT_MODULUS pmMod,
_In_ PUINT32 pSrc,
_Out_ PUINT32 pDst );
VOID
SYMCRYPT_CALL
SymCryptFdefMontgomeryReduceMulx(
_In_ PCSYMCRYPT_MODULUS pmMod,
_In_ PUINT32 pSrc,
_Out_ PUINT32 pDst )
{
SymCryptFdefMontgomeryReduceC( pmMod, pSrc, pDst );
}
VOID
SYMCRYPT_CALL
SymCryptFdefMontgomeryReduceAsm(
_In_ PCSYMCRYPT_MODULUS pmMod,
_In_ PUINT32 pSrc,
_Out_ PUINT32 pDst )
{
SymCryptFdefMontgomeryReduceC( pmMod, pSrc, pDst );
}

lib/makefile.inc

@ -0,0 +1,12 @@
.SUFFIXES: .symcryptasm .cppasm
# We still have architecture-specific inference rules because otherwise we cannot do any architecture-specific preprocessing
# Preprocess amd64 .symcryptasm into masm
{amd64\}.symcryptasm{$(OBJ_PATH)\$(O)\..\amd64\}.asm:
..\scripts\symcryptasm_processor.py masm $< $(OBJ_PATH)\$(O)\$(<B).cppasm
$(CC) $(CFLAGS) /EP /P /I..\inc\ /I.\ /DSYMCRYPT_CPU_AMD64 /DSYMCRYPT_MASM /Fi$@ $(OBJ_PATH)\$(O)\$(<B).cppasm
# Preprocess x86 .cppasm into masm
{i386\}.cppasm{$(OBJ_PATH)\$(O)\..\i386\}.asm:
$(CC) $(CFLAGS) /EP /P /I..\inc\ /I.\ /DSYMCRYPT_CPU_X86 /DSYMCRYPT_MASM /Fi$@ $<


@ -2212,11 +2212,11 @@ SymCryptFdefModElementToIntGeneric(
SYMCRYPT_ERROR
SYMCRYPT_CALL
SymCryptFdefRawSetValue(
_In_reads_bytes_(cbSrc) PCBYTE pbSrc,
SIZE_T cbSrc,
SYMCRYPT_NUMBER_FORMAT format,
_Out_writes_(nWords) PUINT32 pDst,
UINT32 nWords );
_In_reads_bytes_(cbSrc) PCBYTE pbSrc,
SIZE_T cbSrc,
SYMCRYPT_NUMBER_FORMAT format,
_Out_writes_(nDigits * SYMCRYPT_FDEF_DIGIT_NUINT32) PUINT32 pDst,
UINT32 nDigits );
SYMCRYPT_ERROR
SYMCRYPT_CALL
@ -2250,11 +2250,11 @@ SymCryptFdefModElementSetValueNegUint32(
SYMCRYPT_ERROR
SYMCRYPT_CALL
SymCryptFdefRawGetValue(
_In_reads_(nWords) PCUINT32 pSrc,
UINT32 nWords,
_Out_writes_bytes_(cbBytes) PBYTE pbDst,
SIZE_T cbDst,
SYMCRYPT_NUMBER_FORMAT format );
_In_reads_(nDigits * SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc,
UINT32 nDigits,
_Out_writes_bytes_(cbBytes) PBYTE pbDst,
SIZE_T cbDst,
SYMCRYPT_NUMBER_FORMAT format );
SYMCRYPT_ERROR
SYMCRYPT_CALL
@ -2492,14 +2492,6 @@ SymCryptFdefRawSubUint32(
_Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 pDst,
UINT32 nDigits );
UINT32
SYMCRYPT_CALL
SymCryptFdefRawMaskedAddUint32(
_Inout_updates_( nWords ) PUINT32 pAcc,
_In_reads_( nWords ) PCUINT32 pSrc,
UINT32 mask,
UINT32 nWords );
VOID
SYMCRYPT_CALL
SymCryptFdefModMulGeneric(
@ -2530,16 +2522,6 @@ SymCryptFdefModMulMontgomery256Asm(
_Out_writes_bytes_( cbScratch ) PBYTE pbScratch,
SIZE_T cbScratch );
VOID
SYMCRYPT_CALL
SymCryptFdefModMulMontgomery256Test(
_In_ PCSYMCRYPT_MODULUS pMod,
_In_ PCSYMCRYPT_MODELEMENT pSrc1,
_In_ PCSYMCRYPT_MODELEMENT pSrc2,
_Out_ PSYMCRYPT_MODELEMENT pDst,
_Out_writes_bytes_( cbScratch ) PBYTE pbScratch,
SIZE_T cbScratch );
VOID
SYMCRYPT_CALL
SymCryptFdef369ModMulMontgomery(
@ -2684,11 +2666,11 @@ SymCryptFdefRawMul(
VOID
SYMCRYPT_CALL
SymCryptFdefRawMulMulx(
_In_reads_(nDigits1*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc1,
UINT32 nDigits1,
_In_reads_(nDigits2*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc2,
UINT32 nDigits2,
_Out_writes_(nWords1 + nWords2) PUINT32 pDst );
_In_reads_(nDigits1*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc1,
UINT32 nDigits1,
_In_reads_(nDigits2*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc2,
UINT32 nDigits2,
_Out_writes_((nDigits1+nDigits2)*SYMCRYPT_FDEF_DIGIT_NUINT32) PUINT32 pDst );
VOID
SYMCRYPT_CALL
@ -2696,7 +2678,7 @@ SymCryptFdefRawMulMulx1024(
_In_reads_(nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc1,
_In_reads_(nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc2,
UINT32 nDigits,
_Out_writes_(nWords1 + nWords2) PUINT32 pDst );
_Out_writes_(2*nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PUINT32 pDst );
VOID
SYMCRYPT_CALL
@ -2732,7 +2714,7 @@ UINT32
SYMCRYPT_CALL
SymCryptFdefRawIsEqualUint32(
_In_ PCUINT32 pSrc1,
UINT32 nWords,
UINT32 nDigits,
_In_ UINT32 u32Src2 );
UINT32
@ -2909,27 +2891,27 @@ SymCryptFdef369MaskedCopyAsm(
VOID
SYMCRYPT_CALL
SymCryptFdefRawMulAsm(
_In_reads_(nDgigits1*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc1,
UINT32 nDigits1,
_In_reads_(nDigits2*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc2,
UINT32 nDigits2,
_Out_writes_(nWords1 + nWords2) PUINT32 pDst );
_In_reads_(nDigits1*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc1,
UINT32 nDigits1,
_In_reads_(nDigits2*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc2,
UINT32 nDigits2,
_Out_writes_((nDigits1+nDigits2)*SYMCRYPT_FDEF_DIGIT_NUINT32) PUINT32 pDst );
VOID
SYMCRYPT_CALL
SymCryptFdefRawSquareAsm(
_In_reads_(nDgigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc,
_In_reads_(nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc,
UINT32 nDigits,
_Out_writes_(2*nWords) PUINT32 pDst );
_Out_writes_(2*nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PUINT32 pDst );
VOID
SYMCRYPT_CALL
SymCryptFdef369RawMulAsm(
_In_reads_(nDgigits1*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc1,
UINT32 nDigits1,
_In_reads_(nDigits2*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc2,
UINT32 nDigits2,
_Out_writes_(nWords1 + nWords2) PUINT32 pDst );
_In_reads_(nDigits1*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc1,
UINT32 nDigits1,
_In_reads_(nDigits2*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc2,
UINT32 nDigits2,
_Out_writes_((nDigits1+nDigits2)*SYMCRYPT_FDEF_DIGIT_NUINT32) PUINT32 pDst );
VOID
SYMCRYPT_CALL
@ -2937,14 +2919,14 @@ SymCryptFdefRawMul512Asm(
_In_reads_(nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc1,
_In_reads_(nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc2,
UINT32 nDigits,
_Out_writes_(2*nWords) PUINT32 pDst );
_Out_writes_(2*nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PUINT32 pDst );
VOID
SYMCRYPT_CALL
SymCryptFdefRawSquare512Asm(
_In_reads_(nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc,
UINT32 nDigits,
_Out_writes_(2*nWords) PUINT32 pDst );
_Out_writes_(2*nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PUINT32 pDst );
VOID
SYMCRYPT_CALL
@ -2952,69 +2934,69 @@ SymCryptFdefRawMul1024Asm(
_In_reads_(nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc1,
_In_reads_(nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc2,
UINT32 nDigits,
_Out_writes_(2*nWords) PUINT32 pDst );
_Out_writes_(2*nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PUINT32 pDst );
VOID
SYMCRYPT_CALL
SymCryptFdefRawSquare1024Asm(
_In_reads_(nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc,
UINT32 nDigits,
_Out_writes_(2*nWords) PUINT32 pDst );
_Out_writes_(2*nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PUINT32 pDst );
VOID
SYMCRYPT_CALL
SymCryptFdefMontgomeryReduceAsm(
_In_ PCSYMCRYPT_MODULUS pmMod,
_In_ PUINT32 pSrc,
_Inout_ PUINT32 pSrc,
_Out_ PUINT32 pDst );
VOID
SYMCRYPT_CALL
SymCryptFdefMontgomeryReduce256Asm(
_In_ PCSYMCRYPT_MODULUS pmMod,
_In_ PUINT32 pSrc,
_Inout_ PUINT32 pSrc,
_Out_ PUINT32 pDst );
VOID
SYMCRYPT_CALL
SymCryptFdefMontgomeryReduce512Asm(
_In_ PCSYMCRYPT_MODULUS pmMod,
_In_ PUINT32 pSrc,
_Inout_ PUINT32 pSrc,
_Out_ PUINT32 pDst );
VOID
SYMCRYPT_CALL
SymCryptFdefMontgomeryReduce1024Asm(
_In_ PCSYMCRYPT_MODULUS pmMod,
_In_ PUINT32 pSrc,
_Inout_ PUINT32 pSrc,
_Out_ PUINT32 pDst );
VOID
SYMCRYPT_CALL
SymCryptFdef369MontgomeryReduce(
_In_ PCSYMCRYPT_MODULUS pmMod,
_In_ PUINT32 pSrc,
_Inout_ PUINT32 pSrc,
_Out_ PUINT32 pDst );
VOID
SYMCRYPT_CALL
SymCryptFdef369MontgomeryReduceAsm(
_In_ PCSYMCRYPT_MODULUS pmMod,
_In_ PUINT32 pSrc,
_Inout_ PUINT32 pSrc,
_Out_ PUINT32 pDst );
VOID
SYMCRYPT_CALL
SymCryptFdefMontgomeryReduceMulx(
_In_ PCSYMCRYPT_MODULUS pmMod,
_In_ PUINT32 pSrc,
_Inout_ PUINT32 pSrc,
_Out_ PUINT32 pDst );
VOID
SYMCRYPT_CALL
SymCryptFdefMontgomeryReduceMulx1024(
_In_ PCSYMCRYPT_MODULUS pmMod,
_In_ PUINT32 pSrc,
_Inout_ PUINT32 pSrc,
_Out_ PUINT32 pDst );
// Helper macro for checking for specific key validation flag using bits 4 and 5 in a flags variable


@ -13,14 +13,29 @@ ARM64X_EC_ENABLED=1
TARGETNAME = symcrypt
TARGETTYPE=LIBRARY
KM_LIBRARY = 1 # enable /kernel flag & epilogue metadata
GUARD = 1 # enable CFG
ENABLE_ASM_RETPOLINE = 1
ENABLE_RETPOLINE_LINKER_WARNING = 1
# Enable /Gy for all assembler code
ASM_DEFINES=$(ASM_DEFINES) /Gy
USE_MAKEFILE_INC = 1
# Explicitly call out that we must preprocess symcryptasm files
# Make the target paths be architecture specific to get nmake to pick the right inference rule
NTTARGETFILE0=\
!IF "$(_BUILDARCH)" == "amd64"
$(OBJ_PATH)\$(O)\..\amd64\fdef_asm.asm \
$(OBJ_PATH)\$(O)\..\amd64\wipe.asm \
$(OBJ_PATH)\$(O)\..\amd64\aesasm.asm \
$(OBJ_PATH)\$(O)\..\amd64\fdef369_asm.asm \
$(OBJ_PATH)\$(O)\..\amd64\fdef_mulx.asm \
!ELSEIF "$(_BUILDARCH)" == "x86"
$(OBJ_PATH)\$(O)\..\i386\fdef_asm.asm \
!ENDIF
INCLUDES= \
..\inc; \
$(DS_INC_PATH)\crypto; \
@ -137,7 +152,6 @@ SOURCES= \
scsTools.c \
AMD64_SOURCES = \
# sha1asm.asm \
wipe.asm \
aesasm.asm \
fdef_asm.asm \
@ -145,10 +159,8 @@ AMD64_SOURCES = \
fdef_mulx.asm \
I386_SOURCES = \
# sha1asm.asm \
aesasm.asm \
wipe.asm \
# rc4asm.asm \
fdef_asm.asm \
ARM_SOURCES = \


@ -0,0 +1,36 @@
//
// symcryptasm_shared.cppasm Shared definitions used by the C preprocessor step in symcryptasm
// processing. See scripts/symcryptasm_processor.py for more details.
//
// Copyright (c) Microsoft Corporation. Licensed under the MIT license.
//
#if defined(SYMCRYPT_MASM)
#if defined(SYMCRYPT_CPU_AMD64)
include ksamd64.inc
#endif
#include "C_asm_shared.inc"
#define FILE_END() END
#define ALIGN(__alignment) align __alignment
#define GET_SYMBOL_ADDRESS(__symbol) __symbol
#define HEX(__constant) __constant##h
#elif defined(SYMCRYPT_GAS)
.intel_syntax noprefix
#include "C_asm_shared.inc"
#define FILE_END()
#define ALIGN(__alignment) .align __alignment
#define GET_SYMBOL_ADDRESS(__symbol) __symbol+rip
#define HEX(__constant) 0x##__constant
#else
#error Unknown target assembly
#endif
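//
// Illustrative example (SymCryptSomeTable is a hypothetical symbol): after full
// preprocessing, a symcryptasm line such as
//     lea Q0, [GET_SYMBOL_ADDRESS(SymCryptSomeTable)]
// assembles as "lea rax, [SymCryptSomeTable]" under MASM and as
// "lea rax, [SymCryptSomeTable+rip]" under GAS; similarly HEX(1F) yields 1Fh or 0x1F.
//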


@ -0,0 +1,657 @@
#!/usr/bin/env python3
"""
This script enables processing of symcryptasm files so that they can be assembled in a variety of
environments without requiring forking or duplication of source files - symcryptasm files phrase
assembly in an assembler and environment agnostic way.
The current target assemblers are:
MASM and GAS
The current target environments are:
amd64 Windows (using the Microsoft x64 calling convention), and
amd64 Linux (using the SystemV amd64 calling convention)
Currently we only support functions with up to 6 arguments, and only amd64, but the plan is to
rephrase all remaining .asm in SymCrypt as symcryptasm, extending support as appropriate to enable
this effort.
The processing of symcryptasm files takes place in 2 passes. The first pass is performed by this
symcryptasm_processor.py script, which does the more stateful processing, outputting a .cppasm file.
The .cppasm files are further processed by the C preprocessor to do simpler, stateless text
substitutions, outputting a .asm file which can be assembled by the target assembler for the target
environment.
We have set up the intermediate generated files to be created in the output directories in both
razzle and CMake builds.
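For example, an illustrative two-step invocation (paths hypothetical) would be:
    symcryptasm_processor.py masm amd64/fdef_asm.symcryptasm fdef_asm.cppasm
followed by running the C preprocessor over fdef_asm.cppasm to produce fdef_asm.asm.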
### symcryptasm syntax ###
Different calling conventions pass arguments to functions in different registers, have differing
numbers of volatile and non-volatile registers, and use the stack in different ways.
We define our own register naming scheme which abstracts away the differences between calling
conventions. The generalities of the naming scheme will be similar across target architectures, but
refer to the Architecture specifics below for details. For the following general information we use
the notation R<n> to denote registers in the symcryptasm register naming scheme.
A leaf function (a function which does not call another function) begins with an invocation of the
FUNCTION_START macro which currently takes 3 arguments:
1) The function name
This must be the name that matches the corresponding declaration of the function
2) The number of arguments (arg_count) that the function takes
These arguments will be accessible in some contiguous region of the symcrypt registers at the
start of the function
On amd64 this contiguous region is R1..R<arg_count>
Note: arg_count need not correspond to the exact number of arguments in the function declaration
if the assembly does not use some tail of the arguments
3) The number of registers (reg_count) that the function uses
These registers will be accessible as R0..R<reg_count-1>
A leaf function ends with the FUNCTION_END macro, which also takes the function name
(a FUNCTION_END macro's function name must match the preceding FUNCTION_START's name)
At the function start a prologue is generated which arranges the arguments appropriately in
registers, and saves non-volatile registers that have been requested to be used.
At the function end an epilogue is generated which restores the non-volatile registers and returns.
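As an illustrative sketch (SymCryptExampleAdd is a hypothetical function, not one in SymCrypt),
a leaf function taking 2 arguments and using 3 registers could be written:
    FUNCTION_START(SymCryptExampleAdd, 2, 3)
        mov Q0, [Q1] // Q1 holds the first argument (a pointer)
        add Q0, [Q2] // Q2 holds the second argument; the sum is returned in Q0
    FUNCTION_END(SymCryptExampleAdd)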
A nested function (a function which does call another function) is specified similarly, only using
NESTED_FUNCTION_START and NESTED_FUNCTION_END macros. A nested function currently updates and aligns
the stack pointer in the function prologue, and avoids use of the redzone in the SystemV ABI.
A macro begins with an invocation of the MACRO_START macro, which takes the macro name and a
variable number of macro argument names. It ends with MACRO_END.
### Architecture specifics ###
### amd64 ###
We allow up to 15 registers to be addressed, with the names:
Q0-Q15 (64-bit registers), D0-D15 (32-bit registers), W0-W15 (16-bit registers), and B0-B15 (8-bit
registers)
Xmm0-Xmm5 registers may be used directly in assembly too; in both amd64 calling conventions we
currently support, these registers are volatile and so do not need any special handling
On function entry we insert a prologue which ensures:
Q0 is the result register (the return value of the function, and the low half of a multiplication)
Q1-Q6 are the first 6 arguments passed to the function
Additionally, there is a special case for functions using mul or mulx instructions, as these
instructions make rdx a special register. Functions using these instructions may address Q0-Q14,
and QH. As rdx is used to pass arguments, its value is moved to another register in the function
prologue. The MUL_FUNCTION_START and MUL_FUNCTION_END macros are used in this case.
We currently do not support nested mul functions, as we have none of them.
"""
import re
import types
import logging
class Register:
"""A class to represent registers"""
def __init__(self, name64, name32, name16, name8):
self.name64 = name64
self.name32 = name32
self.name16 = name16
self.name8 = name8
# amd64 registers
REG_RAX = Register("rax", "eax", "ax", "al")
REG_RBX = Register("rbx", "ebx", "bx", "bl")
REG_RCX = Register("rcx", "ecx", "cx", "cl")
REG_RDX = Register("rdx", "edx", "dx", "dl")
REG_RSI = Register("rsi", "esi", "si", "sil")
REG_RDI = Register("rdi", "edi", "di", "dil")
REG_RSP = Register("rsp", "esp", "sp", "spl")
REG_RBP = Register("rbp", "ebp", "bp", "bpl")
REG_R8 = Register( "r8", "r8d", "r8w", "r8b")
REG_R9 = Register( "r9", "r9d", "r9w", "r9b")
REG_R10 = Register("r10", "r10d", "r10w", "r10b")
REG_R11 = Register("r11", "r11d", "r11w", "r11b")
REG_R12 = Register("r12", "r12d", "r12w", "r12b")
REG_R13 = Register("r13", "r13d", "r13w", "r13b")
REG_R14 = Register("r14", "r14d", "r14w", "r14b")
REG_R15 = Register("r15", "r15d", "r15w", "r15b")
class CallingConvention:
"""A class to represent calling conventions"""
def __init__(self, name, architecture, mapping, argument_registers, volatile_registers, gen_prologue_fn, gen_epilogue_fn, gen_get_memslot_offset_fn):
self.name = name
self.architecture = architecture
self.mapping = mapping
self.argument_registers = argument_registers
self.volatile_registers = volatile_registers
self.gen_prologue_fn = types.MethodType(gen_prologue_fn, self)
self.gen_epilogue_fn = types.MethodType(gen_epilogue_fn, self)
self.gen_get_memslot_offset_fn = types.MethodType(gen_get_memslot_offset_fn, self)
def get_mul_mapping_from_normal_mapping(mapping, argument_registers):
"""Gets the register mapping used in functions requiring special rdx handling.
In amd64, when using mul and mulx, rdx is a special register.
rdx is also used for passing arguments in both Msft and System V calling conventions.
In asm functions that use mul or mulx, we will explicitly move the argument passed in
rdx to a different volatile register in the function prologue, and in the function body
we refer to rdx using (Q|D|W|B)H.
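For example, in the Microsoft x64 mapping below, rdx is Q2; the mul mapping therefore makes
QH refer to rdx, maps Q2 to the first non-argument volatile register (r10), and shifts all
later registers down by one index.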
"""
rdx_index = None
return_mapping = { 'H': REG_RDX }
for (index, register) in mapping.items():
if register == REG_RDX:
rdx_index = index
break
for (index, register) in mapping.items():
# preserve argument registers
if (index <= argument_registers) and (index != rdx_index):
return_mapping[index] = register
# replace rdx with the first non-argument register
if index == argument_registers+1:
return_mapping[rdx_index] = register
# shuffle all later registers down to fill the gap
if index > argument_registers+1:
return_mapping[index-1] = register
return return_mapping
# Calling convention constants
MAX_FUNCTION_ARGUMENT_COUNT = 6 # restrict to 6 arguments for now
MAX_FUNCTION_REGISTER_COUNT = 15
# Microsoft x64 calling convention
MAPPING_AMD64_MSFT = {
0: REG_RAX, # Result register
1: REG_RCX, # Argument 1 / volatile
2: REG_RDX, # Argument 2 / volatile
3: REG_R8, # Argument 3 / volatile
4: REG_R9, # Argument 4 / volatile
5: REG_R10, # volatile
6: REG_R11, # volatile
7: REG_RSI, # All registers from rsi are non-volatile and need to be saved/restored in epi/prologue
8: REG_RDI,
9: REG_RBP,
10:REG_RBX,
11:REG_R12,
12:REG_R13,
13:REG_R14,
14:REG_R15,
# currently not mapping rsp
}
def calc_amd64_shadow_space_allocation_size(self, reg_count):
# If we are a nested function, we must allocate 32B of shadow space on the stack, and ensure the
# stack pointer is aligned to 16B
# Before the prologue we have rsp % 16 == 8 - as the call pushed an 8B return address on an
# aligned stack
alignment = 8
# We then pushed some number of additional 8B registers onto the stack
if reg_count > self.volatile_registers:
alignment = (alignment + (8 * (reg_count - self.volatile_registers))) % 16
shadow_space_allocation_size = 32
if alignment == 8:
# possibly allocate 8 more bytes to align the stack to 16B
shadow_space_allocation_size += 8
return shadow_space_allocation_size
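# Worked example (illustrative): under the msft_x64 convention (7 volatile registers), a nested
# function with reg_count == 9 pushed 2 registers, so rsp % 16 == 8 at this point; 32B of shadow
# space plus 8B of padding (40B total) restores 16B stack alignment.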
def gen_prologue_amd64_msft(self, arg_count, reg_count, mul_fixup="", nested=False):
prologue = "\n"
if reg_count > self.volatile_registers:
prologue += "rex_push_reg Q%s\n" % self.volatile_registers
for i in range(self.volatile_registers+1, reg_count):
prologue += "push_reg Q%s\n" % i
prologue += "\nEND_PROLOGUE\n\n"
shadow_space_allocation_size = 0
if nested:
shadow_space_allocation_size = calc_amd64_shadow_space_allocation_size(self, reg_count)
prologue += "sub rsp, %d // allocate shadow space and align stack\n\n" % shadow_space_allocation_size
prologue += mul_fixup
# put additional arguments into Q5-Q6 (we do not support more than 6 (MAX_FUNCTION_ARGUMENT_COUNT) arguments for now)
# stack_offset to get the 5th argument is:
# 32B of shadow space + 8B for return address + (8*#pushed registers in prologue) + shadow_space_allocation_size
stack_offset = 32 + 8 + (8*(reg_count-self.volatile_registers)) + shadow_space_allocation_size
for i in range(self.argument_registers+1, min(arg_count+1, MAX_FUNCTION_ARGUMENT_COUNT+1)):
prologue += "mov Q%s, [rsp + %d]\n" % (i, stack_offset)
stack_offset += 8
return prologue
def gen_prologue_amd64_msft_mul(self, arg_count, reg_count):
return gen_prologue_amd64_msft(self, arg_count, reg_count, "mov Q2, QH\n")
def gen_prologue_amd64_msft_nested(self, arg_count, reg_count):
return gen_prologue_amd64_msft(self, arg_count, reg_count, "", nested=True)
def gen_epilogue_amd64_msft(self, arg_count, reg_count, nested=False):
epilogue = ""
if nested:
shadow_space_allocation_size = calc_amd64_shadow_space_allocation_size(self, reg_count)
epilogue += "add rsp, %d // deallocate shadow space and align stack\n\n" % shadow_space_allocation_size
if reg_count > self.volatile_registers:
epilogue += "BEGIN_EPILOGUE\n"
for i in reversed(range(self.volatile_registers, reg_count)):
epilogue += "pop Q%s\n" % i
epilogue += "ret\n"
return epilogue
def gen_epilogue_amd64_msft_nested(self, arg_count, reg_count):
return gen_epilogue_amd64_msft(self, arg_count, reg_count, nested=True)
def gen_get_memslot_offset_amd64_msft(self, slot, arg_count, reg_count, nested=False):
# only support 4 memory slots for now (in shadow space)
if(slot >= 4):
logging.error("Symcryptasm currently only support 4 memory slots! (requested slot%d)" % slot)
exit(1)
# 8B for return address + (8*#pushed registers in prologue)
stack_offset = 8 + (8*(reg_count-self.volatile_registers))
if nested:
stack_offset += calc_amd64_shadow_space_allocation_size(self, reg_count)
return "%d /*MEMSLOT%d*/" % (stack_offset+(8*slot), slot)
def gen_get_memslot_offset_amd64_msft_nested(self, slot, arg_count, reg_count):
return gen_get_memslot_offset_amd64_msft(self, slot, arg_count, reg_count, nested=True)
CALLING_CONVENTION_AMD64_MSFT = CallingConvention(
"msft_x64", "amd64", MAPPING_AMD64_MSFT, 4, 7,
gen_prologue_amd64_msft, gen_epilogue_amd64_msft, gen_get_memslot_offset_amd64_msft)
CALLING_CONVENTION_AMD64_MSFT_MUL = CallingConvention(
"msft_x64", "amd64", get_mul_mapping_from_normal_mapping(MAPPING_AMD64_MSFT, 4), 4, 6,
gen_prologue_amd64_msft_mul, gen_epilogue_amd64_msft, gen_get_memslot_offset_amd64_msft)
CALLING_CONVENTION_AMD64_MSFT_NESTED = CallingConvention(
"msft_x64", "amd64", MAPPING_AMD64_MSFT, 4, 7,
gen_prologue_amd64_msft_nested, gen_epilogue_amd64_msft_nested, gen_get_memslot_offset_amd64_msft_nested)
# AMD64 System V calling convention
MAPPING_AMD64_SYSTEMV = {
0: REG_RAX, # Result register
1: REG_RDI, # Argument 1 / volatile
2: REG_RSI, # Argument 2 / volatile
3: REG_RDX, # Argument 3 / volatile
4: REG_RCX, # Argument 4 / volatile
5: REG_R8, # Argument 5 / volatile
6: REG_R9, # Argument 6 / volatile
7: REG_R10, # volatile
8: REG_R11, # volatile
9: REG_RBX, # All registers from rbx are non-volatile and need to be saved/restored in epi/prologue
10:REG_RBP,
11:REG_R12,
12:REG_R13,
13:REG_R14,
14:REG_R15
# currently not mapping rsp
}
def gen_prologue_amd64_systemv(self, arg_count, reg_count, mul_fixup="", nested=False):
# push volatile registers onto the stack
prologue = "\n"
if reg_count > self.volatile_registers:
for i in range(self.volatile_registers, reg_count):
prologue += "push Q%s\n" % i
# If we are a nested function, we need to align the stack to 16B, and allocate space for up to 4
# memory slots not in the redzone. We can use the same logic as on the MSFT x64 side to allocate
# our own space for 32B of local variables (whereas on the MSFT side, we use this for allocating
# space for a function we are about to call)
if nested:
allocation_size = calc_amd64_shadow_space_allocation_size(self, reg_count)
prologue += "sub rsp, %d // allocate memslot space and align stack\n\n" % allocation_size
prologue += mul_fixup
# do not support more than 6 (MAX_FUNCTION_ARGUMENT_COUNT) arguments for now
# # put additional arguments into Q7-Qn
# # stack_offset to get the 7th argument is:
# # 8B for return address
# stack_offset = 8
# for i in range(self.argument_registers+1, arg_count+1):
# prologue += "mov Q%s, [rsp + %d]\n" % (i, stack_offset)
# stack_offset += 8
return prologue
def gen_prologue_amd64_systemv_mul(self, arg_count, reg_count):
return gen_prologue_amd64_systemv(self, arg_count, reg_count, "mov Q3, QH\n")
def gen_prologue_amd64_systemv_nested(self, arg_count, reg_count):
return gen_prologue_amd64_systemv(self, arg_count, reg_count, "", nested=True)
def gen_epilogue_amd64_systemv(self, arg_count, reg_count, nested=False):
epilogue = ""
if nested:
allocation_size = calc_amd64_shadow_space_allocation_size(self, reg_count)
epilogue += "add rsp, %d // deallocate memslot space and align stack\n\n" % allocation_size
if reg_count > self.volatile_registers:
for i in reversed(range(self.volatile_registers, reg_count)):
epilogue += "pop Q%s\n" % i
epilogue += "ret\n"
return epilogue
def gen_epilogue_amd64_systemv_nested(self, arg_count, reg_count):
return gen_epilogue_amd64_systemv(self, arg_count, reg_count, nested=True)
def gen_get_memslot_offset_amd64_systemv(self, slot, arg_count, reg_count, nested=False):
# only support 4 memory slots for now
if(slot >= 4):
logging.error("Symcryptasm currently only support 4 memory slots! (requested slot%d)" % slot)
exit(1)
# For leaf functions, use the top of the redzone below the stack pointer
offset = -8 * (slot+1)
if nested:
# For nested functions, use the 32B of memslot space above the stack pointer created in the prologue
offset = 8*slot
return "%d /*MEMSLOT%d*/" % (offset, slot)
def gen_get_memslot_offset_amd64_systemv_nested(self, slot, arg_count, reg_count):
return gen_get_memslot_offset_amd64_systemv(self, slot, arg_count, reg_count, nested=True)
CALLING_CONVENTION_AMD64_SYSTEMV = CallingConvention(
"amd64_systemv", "amd64", MAPPING_AMD64_SYSTEMV, 6, 9,
gen_prologue_amd64_systemv, gen_epilogue_amd64_systemv, gen_get_memslot_offset_amd64_systemv)
CALLING_CONVENTION_AMD64_SYSTEMV_MUL = CallingConvention(
"amd64_systemv", "amd64", get_mul_mapping_from_normal_mapping(MAPPING_AMD64_SYSTEMV, 6), 6, 8,
gen_prologue_amd64_systemv_mul, gen_epilogue_amd64_systemv, gen_get_memslot_offset_amd64_systemv)
CALLING_CONVENTION_AMD64_SYSTEMV_NESTED = CallingConvention(
"amd64_systemv", "amd64", MAPPING_AMD64_SYSTEMV, 6, 9,
gen_prologue_amd64_systemv_nested, gen_epilogue_amd64_systemv_nested, gen_get_memslot_offset_amd64_systemv_nested)
def gen_function_start_defines(mapping, arg_count, reg_count):
defines = ""
for (index, reg) in mapping.items():
if (index != 'H') and (index >= max(arg_count+1, reg_count)):
continue
defines += "#define Q%s %s\n" % (index, reg.name64)
defines += "#define D%s %s\n" % (index, reg.name32)
defines += "#define W%s %s\n" % (index, reg.name16)
defines += "#define B%s %s\n" % (index, reg.name8)
return defines
def gen_function_end_defines(mapping, arg_count, reg_count):
undefs = ""
for (index, _) in mapping.items():
if (index != 'H') and (index >= max(arg_count+1, reg_count)):
continue
undefs += "#undef Q%s\n" % (index)
undefs += "#undef D%s\n" % (index)
undefs += "#undef W%s\n" % (index)
undefs += "#undef B%s\n" % (index)
return undefs
MASM_FRAMELESS_FUNCTION_ENTRY = "LEAF_ENTRY %s, _TEXT\n"
MASM_FRAMELESS_FUNCTION_END = "LEAF_END %s, _TEXT\n"
MASM_FRAME_FUNCTION_ENTRY = "NESTED_ENTRY %s, _TEXT\n"
MASM_FRAME_FUNCTION_END = "NESTED_END %s, _TEXT\n"
GAS_FUNCTION_ENTRY = "%s: .global %s\n"
GAS_FUNCTION_END = ""
def generate_prologue(assembler, calling_convention, function_name, arg_count, reg_count, nested):
function_entry = None
if assembler == "masm":
# need to identify and mark up frame functions in masm
if nested or (reg_count > calling_convention.volatile_registers):
function_entry = MASM_FRAME_FUNCTION_ENTRY % (function_name)
else:
function_entry = MASM_FRAMELESS_FUNCTION_ENTRY % (function_name)
elif assembler == "gas":
function_entry = GAS_FUNCTION_ENTRY % (function_name, function_name)
prologue = gen_function_start_defines(calling_convention.mapping, arg_count, reg_count)
prologue += "%s" % (function_entry)
prologue += calling_convention.gen_prologue_fn(arg_count, reg_count)
return prologue
def generate_epilogue(assembler, calling_convention, function_name, arg_count, reg_count, nested):
function_end = None
if assembler == "masm":
# need to identify and mark up frame functions in masm
if nested or (reg_count > calling_convention.volatile_registers):
function_end = MASM_FRAME_FUNCTION_END % (function_name)
else:
function_end = MASM_FRAMELESS_FUNCTION_END % (function_name)
elif assembler == "gas":
function_end = GAS_FUNCTION_END
epilogue = calling_convention.gen_epilogue_fn(arg_count, reg_count)
epilogue += "%s" % (function_end)
epilogue += gen_function_end_defines(calling_convention.mapping, arg_count, reg_count)
return epilogue
MASM_MACRO_START = "%s MACRO %s\n"
MASM_MACRO_END = "ENDM\n"
GAS_MACRO_START = ".macro %s %s\n"
GAS_MACRO_END = ".endm\n"
MASM_ALTERNATE_ENTRY= "ALTERNATE_ENTRY %s\n"
GAS_ALTERNATE_ENTRY = "%s: .global %s\n"
FUNCTION_START_PATTERN = re.compile(r"\s*(NESTED_)?(MUL_)?FUNCTION_START\s*\(\s*([a-zA-Z0-9]+)\s*,\s*([0-9]+)\s*,\s*([0-9]+)\s*\)")
FUNCTION_END_PATTERN = re.compile(r"\s*(NESTED_)?(MUL_)?FUNCTION_END\s*\(\s*([a-zA-Z0-9]+)\s*\)")
GET_MEMSLOT_PATTERN = re.compile(r"GET_MEMSLOT_OFFSET\s*\(\s*slot([0-9]+)\s*\)")
ALTERNATE_ENTRY_PATTERN = re.compile(r"\s*ALTERNATE_ENTRY\s*\(\s*([a-zA-Z0-9]+)\s*\)")
MACRO_START_PATTERN = re.compile(r"\s*MACRO_START\s*\(\s*([A-Z_0-9]+)\s*,([^\)]+)\)")
MACRO_END_PATTERN = re.compile(r"\s*MACRO_END\s*\(\s*\)")
class ProcessingStateMachine:
"""A class to hold the state when processing a file and handle files line by line"""
def __init__(self, assembler, normal_calling_convention, mul_calling_convention, nested_calling_convention):
self.assembler = assembler
self.normal_calling_convention = normal_calling_convention
self.mul_calling_convention = mul_calling_convention
self.nested_calling_convention = nested_calling_convention
self.function_start_match = None
self.function_start_line = 0
self.is_nested_function = None
self.is_mul_function = None
self.calling_convention = None
self.function_name = None
self.arg_count = None
self.reg_count = None
self.macro_start_match = None
self.macro_name = None
self.macro_args = None
def process_line(self, line, line_num):
if self.function_start_match == None and self.macro_start_match == None:
return self.process_normal_line(line, line_num)
elif self.function_start_match != None:
return self.process_function_line(line, line_num)
elif self.macro_start_match != None:
return self.process_macro_line(line, line_num)
else:
logging.error("Whoops, something is broken with the state machine (failed at line %d)" % line_num)
exit(1)
def process_normal_line(self, line, line_num):
# Not currently in a function or macro
match = FUNCTION_START_PATTERN.match(line)
if (match):
return self.process_start_function(match, line, line_num)
match = MACRO_START_PATTERN.match(line)
if (match):
return self.process_start_macro(match, line, line_num)
# Not starting a function or a macro
return line
def process_start_function(self, match, line, line_num):
# Entering a new function
self.function_start_match = match
self.function_start_line = line_num
self.is_nested_function = (match.group(1) == "NESTED_")
self.is_mul_function = (match.group(2) == "MUL_")
self.function_name = match.groups()[-3]
self.arg_count = int(match.groups()[-2])
self.reg_count = int(match.groups()[-1])
if self.is_nested_function and self.is_mul_function:
logging.error(
"Too many prefixes for symcryptasm function - currently only 1 of prefix, MUL_ or NESTED_, is supported!\n\t"
"%s (line %d)"
% (line, line_num))
exit(1)
if self.arg_count > MAX_FUNCTION_ARGUMENT_COUNT:
logging.error(
"Too many (%d) arguments for symcryptasm function - currently only %d arguments are supported!\n\t"
"%s (line %d)"
% (self.arg_count, MAX_FUNCTION_ARGUMENT_COUNT, match.group(0), line_num))
exit(1)
if self.reg_count > MAX_FUNCTION_REGISTER_COUNT:
logging.error(
"Too many (%d) registers required for symcryptasm function - only %d registers are supported!\n\t"
"%s (line %d)"
% (self.reg_count, MAX_FUNCTION_REGISTER_COUNT, match.group(0), line_num))
exit(1)
if self.is_mul_function and self.reg_count > MAX_FUNCTION_REGISTER_COUNT-1:
logging.error(
"Too many (%d) registers required for symcryptasm mul function - only %d registers are supported!\n\t"
"%s (line %d)"
% (self.reg_count, MAX_FUNCTION_REGISTER_COUNT-1, match.group(0), line_num))
exit(1)
logging.info("%d: function start %s, %d, %d" % (line_num, self.function_name, self.arg_count, self.reg_count))
if self.is_nested_function:
self.calling_convention = self.nested_calling_convention
elif self.is_mul_function:
self.calling_convention = self.mul_calling_convention
else:
self.calling_convention = self.normal_calling_convention
return generate_prologue(self.assembler, self.calling_convention, self.function_name, self.arg_count, self.reg_count, self.is_nested_function)
def process_start_macro(self, match, line, line_num):
self.macro_start_match = match
self.macro_name = match.group(1)
self.macro_args = [ x.strip() for x in match.group(2).split(",") ]
logging.info("%d: macro start %s, %s" % (line_num, self.macro_name, self.macro_args))
if self.assembler == "masm":
return MASM_MACRO_START % (self.macro_name, match.group(2))
elif self.assembler == "gas":
return GAS_MACRO_START % (self.macro_name, match.group(2))
def process_function_line(self, line, line_num):
# Currently in a function
match = ALTERNATE_ENTRY_PATTERN.match(line)
if (match):
if self.assembler == "masm":
return MASM_ALTERNATE_ENTRY % match.group(1)
elif self.assembler == "gas":
return GAS_ALTERNATE_ENTRY % (match.group(1), match.group(1))
match = FUNCTION_END_PATTERN.match(line)
if (match):
# Check the end function has same prefix as previous start function
if (self.is_nested_function ^ (match.group(1) == "NESTED_")) or \
(self.is_mul_function ^ (match.group(2) == "MUL_")):
logging.error("Function start and end do not have same MUL_ or NESTED_ prefix!\n\tStart: %s (line %d)\n\tEnd: %s (line %d)"
% (self.function_start_match.group(0), self.function_start_line, match.group(0), line_num))
exit(1)
# Check the end function pattern has the same label as the previous start function pattern
if self.function_name != match.groups()[-1]:
logging.error("Function start label does not match Function end label!\n\tStart: %s (line %d)\n\tEnd: %s (line %d)"
% (self.function_name, self.function_start_line, match.groups()[-1], line_num))
exit(1)
epilogue = generate_epilogue(self.assembler, self.calling_convention, self.function_name, self.arg_count, self.reg_count, self.is_nested_function)
logging.info("%d: function end %s" % (line_num, self.function_name))
self.function_start_match = None
self.function_start_line = 0
self.is_nested_function = None
self.is_mul_function = None
self.calling_convention = None
self.function_name = None
self.arg_count = None
self.reg_count = None
return epilogue
# replace any GET_MEMSLOT_OFFSET macros in line
match = GET_MEMSLOT_PATTERN.search(line)
while(match):
slot = int(match.group(1))
replacement = self.calling_convention.gen_get_memslot_offset_fn(slot, self.arg_count, self.reg_count)
line = GET_MEMSLOT_PATTERN.sub(replacement, line)
match = GET_MEMSLOT_PATTERN.search(line)
logging.info("%d: memslot macro %d" % (line_num, slot))
# Not modifying the line any further
return line
def process_macro_line(self, line, line_num):
# Currently in a macro
match = MACRO_END_PATTERN.match(line)
if (match):
logging.info("%d: macro end %s" % (line_num, self.macro_name))
self.macro_start_match = None
self.macro_name = None
self.macro_args = None
if self.assembler == "masm":
return MASM_MACRO_END
elif self.assembler == "gas":
return GAS_MACRO_END
if self.assembler == "gas":
# In GAS macros we need to escape all of the macro arguments with a backslash in the macro body
for arg in self.macro_args:
line = re.sub(arg, r"\\%s" % arg, line)
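# e.g. (illustrative): for a macro argument named count, a body line "mov Q0, count"
# is rewritten to "mov Q0, \count" so that GAS substitutes the argument value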
# Not modifying the line any further
return line
def process_file(target, infilename, outfilename):
assembler = None
if target == "masm":
assembler = "masm"
normal_calling_convention = CALLING_CONVENTION_AMD64_MSFT
mul_calling_convention = CALLING_CONVENTION_AMD64_MSFT_MUL
nested_calling_convention = CALLING_CONVENTION_AMD64_MSFT_NESTED
elif target == "gas":
assembler = "gas"
normal_calling_convention = CALLING_CONVENTION_AMD64_SYSTEMV
mul_calling_convention = CALLING_CONVENTION_AMD64_SYSTEMV_MUL
nested_calling_convention = CALLING_CONVENTION_AMD64_SYSTEMV_NESTED
else:
logging.error("Unknown target assembler (%s)" % target)
exit(1)
# iterate through file line by line in one pass
file_processing_state = ProcessingStateMachine(
assembler, normal_calling_convention, mul_calling_convention, nested_calling_convention)
with open(infilename) as infile:
with open(outfilename, "w") as outfile:
for line_num, line in enumerate(infile, start=1):
processed_line = file_processing_state.process_line(line, line_num)
outfile.write(processed_line)
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser(description="Preprocess symcryptasm into files that will be further processed with C preprocessor to generate MASM or GAS")
parser.add_argument('target', type=str, help='Target that we want to preprocess for')
parser.add_argument('inputfile', type=str, help='Path to input file')
parser.add_argument('outputfile', type=str, help='Path to output file')
args = parser.parse_args()
process_file(args.target, args.inputfile, args.outputfile)


@ -7,7 +7,6 @@
#include "precomp.h"
#define EQU =
#include "C_asm_shared.inc"
VOID