diff --git a/CMakeLists.txt b/CMakeLists.txt index 69aa0e6..8882479 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -28,9 +28,8 @@ set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib/${CMAKE_SYSTEM_PROCES set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/exe/${CMAKE_SYSTEM_PROCESSOR}/${SYMCRYPT_TARGET_ENV}) if(WIN32 AND SYMCRYPT_TARGET_ENV MATCHES "WindowsUserMode") - # Set DBG=1 and enable ASM_MASM. Annoyingly, this has to be done in the main CMake file rather than in - # the toolchain file - add_compile_options(-DDBG=1) + # Enable ASM_MASM. Annoyingly, this has to be done in the main CMake file rather than in the + # toolchain file enable_language(ASM_MASM) add_compile_options(/MP) # Remove /RTC1, incompatible with /Ox @@ -43,16 +42,23 @@ if(WIN32 AND SYMCRYPT_TARGET_ENV MATCHES "WindowsUserMode") string( REPLACE "/Od" "" CMAKE_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG}) string( REPLACE "/Od" "" CMAKE_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE}) string( REPLACE "/Od" "" CMAKE_C_FLAGS_RELEASE ${CMAKE_C_FLAGS_RELEASE}) - - IF(CMAKE_BUILD_TYPE MATCHES Release) - message("Release mode") + + if(CMAKE_BUILD_TYPE MATCHES Release) add_compile_options(/Oxs) - ENDIF(CMAKE_BUILD_TYPE MATCHES Release) + endif() elseif(NOT WIN32) enable_language(ASM) add_compile_options(-Wno-deprecated-declarations -Wno-deprecated) add_compile_options(-g) add_compile_options(-Wno-multichar) + add_compile_options(-fPIC) +endif() + +if(CMAKE_BUILD_TYPE MATCHES Release) + message("Release mode") +else() + message("Debug mode") + add_compile_options(-DDBG=1) endif() include_directories(inc) diff --git a/README.md b/README.md index 65fe375..b14c0f6 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Introduction +# Introduction SymCrypt is the core cryptographic function library currently used by Windows. ## History @@ -30,20 +30,23 @@ or gcc 7.4.0 on Linux. Note that CMake ships with Visual Studio 2019. 4. Configure CMake compilation: * For 32-bit Windows targets: `cmake .. -DCMAKE_TOOLCHAIN_FILE=../cmake-toolchain/windows-x86.cmake -A Win32` * For 64-bit Windows targets: `cmake .. -DCMAKE_TOOLCHAIN_FILE=../cmake-toolchain/windows-amd64.cmake` - * For Linux (or Windows with no CPU optimizations): `cmake ..` + * For 64-bit Linux targets: `cmake .. -DCMAKE_TOOLCHAIN_FILE=../cmake-toolchain/linux-amd64.cmake` + * For no CPU optimizations: `cmake ..` + * Optionally, for a release build, specify `-DCMAKE_BUILD_TYPE=Release` 5. `cmake --build .` + * Optionally, specify `-jN` where N is the number of processes you wish to spawn for the build If compilation succeeds, the output will be put in the `exe` subdirectory relative to where compilation occurred (i.e. `bin/exe` if you followed the instructions above). The SymCrypt unit test is in the `unittest` directory. It runs extensive functional tests on the SymCrypt library. On Windows it also compares results against other implementations such as the Windows APIs CNG -and CAPI, and the older crypto libraries rsa32 and msbignum, if they are available. It also provides +and CAPI, and the older crypto libraries rsa32 and msbignum, if they are available. It also provides detailed performance information. # Security Bugs If you believe you have found a problem that affects the security of this code, please do **NOT** create an issue -or pull request, but instead email your comments to secure@microsoft.com. +or pull request, but instead email your comments to secure@microsoft.com. # Contribute We love to receive comments and suggestions.
Unfortunately we cannot accept external code contributions at this time. diff --git a/cmake-toolchain/linux-amd64.cmake b/cmake-toolchain/linux-amd64.cmake index 4af573d..6e4e3a1 100644 --- a/cmake-toolchain/linux-amd64.cmake +++ b/cmake-toolchain/linux-amd64.cmake @@ -10,7 +10,6 @@ set(SYMCRYPT_TARGET_ENV Linux) # Define _AMD64_ to set up the correct SymCrypt macros, e.g. SYMCRYPT_CPU_AMD64 add_compile_options(-D_AMD64_) -add_compile_options(-DDBG) add_compile_options(-O3) # Enable a baseline of features for the compiler to support everywhere diff --git a/inc/C_asm_shared.inc b/inc/C_asm_shared.inc index 0057c3e..7a7e326 100644 --- a/inc/C_asm_shared.inc +++ b/inc/C_asm_shared.inc @@ -1,70 +1,43 @@ -;/* -; C_asm_shared.inc file to synchronize C and Asm information -; Copyright (c) Microsoft Corporation. Licensed under the MIT license. +/* + C_asm_shared.inc file to synchronize C and Asm information + Copyright (c) Microsoft Corporation. Licensed under the MIT license. -; This is a file that compiles both in C and ASM to define values in a way that is guaranteed to be the same on both sides. -; We use this to define the structure offsets that the ASM code uses. -; By having equivalent C constants we can add checks to the C code to ensure they are correct. -; -; This is an ugly hack, but it works :-) -; -; Due to the fact that the ARM assemblers use the C precompiler -; the C files have to redefine EQU to nothing before including this file. -; */ + This is a file that is included in both C and ASM such that the values are the same on both sides. + We use the C preprocessor to set ASM constants, as we already need to use the C preprocessor for + symcryptasm processing (see scripts/symcryptasm_processor.py). + We use this to define the structure offsets that the ASM code uses. + By having equivalent C constants we can add checks to the C code to ensure they are correct. 
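As an illustration of the pattern described above, here is a minimal, self-contained C sketch (the EXAMPLE_* and Example* names are hypothetical stand-ins, not the real SymCrypt definitions) showing how a single SET(...) line can serve MASM, GAS and C, and how the C side can verify a hard-coded offset:

```c
/* Minimal sketch of the shared-constant pattern described above; the
   EXAMPLE_* and Example* names are hypothetical stand-ins, not the real
   SymCrypt definitions. */
#include <stddef.h>
#include <stdio.h>

#if defined(EXAMPLE_MASM)
#define SET(_variable, _value) _variable EQU _value
#elif defined(EXAMPLE_GAS)
#define SET(_variable, _value) .set _variable, _value
#else /* plain C */
#define SET(_variable, _value) static const size_t _variable = _value;
#endif

typedef struct {
    unsigned type;
    unsigned nDigits;   /* expected at offset 4, matching the constant below */
} EXAMPLE_MODULUS;

SET(ExampleModulusNdigitsOffset, 4)  /* one line, usable from MASM, GAS and C */

int main(void)
{
    /* The C build can verify the offset that the ASM code hard-codes. */
    if (ExampleModulusNdigitsOffset != offsetof(EXAMPLE_MODULUS, nDigits)) {
        fprintf(stderr, "ASM offset out of sync with C struct layout\n");
        return 1;
    }
    return 0;
}
```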
+*/ -;const SIZE_T -SymCryptModulusNdigitsOffsetAmd64 EQU 4; +#if defined(SYMCRYPT_MASM) +#define SET(_variable, _value) _variable EQU _value +#elif defined(SYMCRYPT_GAS) +#define SET(_variable, _value) .set _variable, _value +#else // assume C +#define SET(_variable, _value) const SIZE_T _variable = _value; +#endif -; const SIZE_T -SymCryptModulusMontgomeryInv64OffsetAmd64 EQU 32; +SET(SymCryptModulusNdigitsOffsetAmd64, 4); +SET(SymCryptModulusMontgomeryInv64OffsetAmd64, 32); +SET(SymCryptModulusValueOffsetAmd64, 128); -; const SIZE_T -SymCryptModulusValueOffsetAmd64 EQU 128; +SET(SymCryptModulusNdigitsOffsetX86, 4); +SET(SymCryptModulusMontgomeryInv64OffsetX86, 24); +SET(SymCryptModulusValueOffsetX86, 96); +SET(SymCryptModulusNdigitsOffsetArm64, 4); +SET(SymCryptModulusMontgomeryInv64OffsetArm64, 32); +SET(SymCryptModulusValueOffsetArm64, 128); +SET(SymCryptModulusNdigitsOffsetArm, 4); +SET(SymCryptModulusMontgomeryInv64OffsetArm, 24); +SET(SymCryptModulusValueOffsetArm, 96); - -;const SIZE_T -SymCryptModulusNdigitsOffsetX86 EQU 4; - -; const SIZE_T -SymCryptModulusMontgomeryInv64OffsetX86 EQU 24; - -; const SIZE_T -SymCryptModulusValueOffsetX86 EQU 96; - - - - -;const SIZE_T -SymCryptModulusNdigitsOffsetArm64 EQU 4; - -; const SIZE_T -SymCryptModulusMontgomeryInv64OffsetArm64 EQU 32; - -; const SIZE_T -SymCryptModulusValueOffsetArm64 EQU 128; - - - - -;const SIZE_T -SymCryptModulusNdigitsOffsetArm EQU 4; - -; const SIZE_T -SymCryptModulusMontgomeryInv64OffsetArm EQU 24; - -; const SIZE_T -SymCryptModulusValueOffsetArm EQU 96; - - - - -; /* - IF 0 -; */ -#undef EQU +#if !defined(SYMCRYPT_MASM) && !defined(SYMCRYPT_GAS) +// Preserve the definition of SET for use in symcryptasm processing +#undef SET +#endif #if SYMCRYPT_CPU_AMD64 #define SYMCRYPT_CHECK_ASM_OFFSETS \ @@ -89,14 +62,9 @@ SymCryptModulusValueOffsetArm EQU 96; SYMCRYPT_CHECK_ASM_OFFSET( SymCryptModulusNdigitsOffsetArm, SYMCRYPT_FIELD_OFFSET( SYMCRYPT_MODULUS, nDigits ) );\ SYMCRYPT_CHECK_ASM_OFFSET( SymCryptModulusMontgomeryInv64OffsetArm, SYMCRYPT_FIELD_OFFSET( SYMCRYPT_MODULUS, tm.montgomery.inv64 ));\ SYMCRYPT_CHECK_ASM_OFFSET( SymCryptModulusValueOffsetArm, SYMCRYPT_FIELD_OFFSET( SYMCRYPT_MODULUS, Divisor.Int.ti.fdef.uint32 ));\ - + #endif // CPU_* #if !defined( SYMCRYPT_CHECK_ASM_OFFSETS) #define SYMCRYPT_CHECK_ASM_OFFSETS #endif - - -; /* - ENDIF -; */ \ No newline at end of file diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index 8617135..edd3586 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -96,42 +96,148 @@ set(SOURCES_COMMON IEEE802_11SaeCustom.c ) +function(process_cppasm filepath outformat archdefine) + get_filename_component(fileextension ${filepath} EXT) + if(NOT fileextension STREQUAL .cppasm) + message(FATAL_ERROR "cppasm processing invoked on file with incorrect extension (${filepath} -> ${fileextension})") + endif() + if((NOT outformat STREQUAL gas) AND (NOT outformat STREQUAL masm)) + message(FATAL_ERROR "cppasm processing invoked with unrecognized outformat (${outformat})") + endif() + if((NOT archdefine STREQUAL amd64) AND (NOT archdefine STREQUAL x86)) + message(FATAL_ERROR "cppasm processing invoked with unrecognized archdefine (${archdefine})") + endif() + get_filename_component(rootpath ${filepath} DIRECTORY) + get_filename_component(filestem ${filepath} NAME_WE) # filestem is the filename w/out extension + string(TOUPPER ${outformat} outformatupper) + string(TOUPPER ${archdefine} archdefineupper) + string(FIND ${rootpath} ${CMAKE_CURRENT_BINARY_DIR} findindex) # check whether 
input is in the output directory + if(findindex EQUAL -1) # input in the source directory + set(filepath ${CMAKE_CURRENT_SOURCE_DIR}/${filepath}) + set(output_pass2 ${CMAKE_CURRENT_BINARY_DIR}/${rootpath}/${filestem}-${outformat}.asm) + else() # input in the output directory + set(output_directory ${rootpath}) + set(output_pass2 ${rootpath}/${filestem}.asm) + endif() + + set(dbg_definition "") + if(CMAKE_BUILD_TYPE MATCHES Debug) + set(dbg_definition "-DDBG=1") + endif() + + if(outformat STREQUAL gas) + # assume gas => GCC compatible C compiler + add_custom_command( + OUTPUT ${output_pass2} + COMMAND "${CMAKE_C_COMPILER}" -E -P -x c ${filepath} -o ${output_pass2} + -I${CMAKE_CURRENT_SOURCE_DIR} -I${CMAKE_CURRENT_SOURCE_DIR}/${rootpath} -I${CMAKE_SOURCE_DIR}/inc + -DSYMCRYPT_${outformatupper} -DSYMCRYPT_CPU_${archdefineupper} ${dbg_definition} + MAIN_DEPENDENCY ${filepath} + DEPENDS ${CMAKE_SOURCE_DIR}/inc/C_asm_shared.inc ${filepath} symcryptasm_shared.cppasm + COMMENT "C preprocessing ${filepath} to ${outformat} (${output_pass2})" + VERBATIM) + elseif(outformat STREQUAL masm) + # assume masm => MSVC C compiler + add_custom_command( + OUTPUT ${output_pass2} + COMMAND "${CMAKE_C_COMPILER}" /EP /P /Fi${output_pass2} ${filepath} + -I${CMAKE_CURRENT_SOURCE_DIR} -I${CMAKE_CURRENT_SOURCE_DIR}/${rootpath} -I${CMAKE_SOURCE_DIR}/inc + -DSYMCRYPT_${outformatupper} -DSYMCRYPT_CPU_${archdefineupper} ${dbg_definition} + MAIN_DEPENDENCY ${filepath} + DEPENDS ${CMAKE_SOURCE_DIR}/inc/C_asm_shared.inc ${filepath} symcryptasm_shared.cppasm + COMMENT "C preprocessing ${filepath} to ${outformat} (${output_pass2})" + VERBATIM) + endif() +endfunction() + +function(process_symcryptasm filepath outformat archdefine) + get_filename_component(fileextension ${filepath} EXT) + if(NOT fileextension STREQUAL .symcryptasm) + message(FATAL_ERROR "symcryptasm processing invoked on file with incorrect extension (${filepath} -> ${fileextension})") + endif() + if((NOT outformat STREQUAL gas) AND (NOT outformat STREQUAL masm)) + message(FATAL_ERROR "symcryptasm processing invoked with unrecognized outformat (${outformat})") + endif() + get_filename_component(rootpath ${filepath} DIRECTORY) + get_filename_component(filestem ${filepath} NAME_WE) # filestem is the filename w/out extension + set(filepath ${CMAKE_CURRENT_SOURCE_DIR}/${filepath}) + set(output_directory ${CMAKE_CURRENT_BINARY_DIR}/${rootpath}) + set(output_cppasm ${output_directory}/${filestem}-${outformat}.cppasm) + + add_custom_command( + OUTPUT ${output_cppasm} + COMMAND ${CMAKE_COMMAND} -E make_directory ${output_directory} + COMMAND python3 ${CMAKE_SOURCE_DIR}/scripts/symcryptasm_processor.py ${outformat} ${filepath} ${output_cppasm} + MAIN_DEPENDENCY ${filepath} + DEPENDS ${CMAKE_SOURCE_DIR}/scripts/symcryptasm_processor.py + COMMENT "Python preprocessing ${filepath} to ${outformat} (${output_cppasm})" + VERBATIM) + + process_cppasm(${output_cppasm} ${outformat} ${archdefine}) +endfunction() + if(NOT WIN32) list(APPEND SOURCES_COMMON linux/intrinsics.c) - list(APPEND SOURCES_COMMON linux/asmstubs.c) endif() if(WIN32 AND NOT(SYMCRYPT_TARGET_ENV MATCHES "Generic")) if(CMAKE_SYSTEM_PROCESSOR MATCHES "AMD64") + process_symcryptasm(amd64/aesasm.symcryptasm masm amd64) + process_symcryptasm(amd64/fdef_asm.symcryptasm masm amd64) + process_symcryptasm(amd64/fdef369_asm.symcryptasm masm amd64) + process_symcryptasm(amd64/fdef_mulx.symcryptasm masm amd64) + process_symcryptasm(amd64/wipe.symcryptasm masm amd64) + list(APPEND SOURCES_COMMON - amd64/aesasm.asm - 
amd64/fdef_asm.asm - amd64/fdef_mulx.asm - amd64/fdef369_asm.asm - amd64/sha1asm.asm - amd64/wipe.asm) + amd64/aesasm-masm.asm + amd64/fdef_asm-masm.asm + amd64/fdef369_asm-masm.asm + amd64/fdef_mulx-masm.asm + amd64/wipe-masm.asm) set_source_files_properties( - amd64/aesasm.asm - amd64/fdef_asm.asm - amd64/fdef_mulx.asm - amd64/fdef369_asm.asm - amd64/sha1asm.asm - amd64/wipe.asm + amd64/aesasm-masm.asm + amd64/fdef_asm-masm.asm + amd64/fdef369_asm-masm.asm + amd64/fdef_mulx-masm.asm + amd64/wipe-masm.asm PROPERTY LANGUAGE ASM_MASM) + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "X86") + process_cppasm(i386/fdef_asm.cppasm masm x86) + list(APPEND SOURCES_COMMON i386/aesasm.asm - i386/fdef_asm.asm - i386/rc4asm.asm - i386/sha1asm.asm + i386/fdef_asm-masm.asm i386/wipe.asm) set_source_files_properties( i386/aesasm.asm - i386/fdef_asm.asm - i386/rc4asm.asm - i386/sha1asm.asm + i386/fdef_asm-masm.asm i386/wipe.asm PROPERTY LANGUAGE ASM_MASM) + set_source_files_properties( + i386/fdef_asm-masm.asm PROPERTIES INCLUDE_DIRECTORIES ${CMAKE_CURRENT_SOURCE_DIR}/i386) + endif() +else() + if(CMAKE_SYSTEM_PROCESSOR MATCHES "AMD64") + process_symcryptasm(amd64/aesasm.symcryptasm gas amd64) + process_symcryptasm(amd64/fdef_asm.symcryptasm gas amd64) + process_symcryptasm(amd64/fdef369_asm.symcryptasm gas amd64) + process_symcryptasm(amd64/fdef_mulx.symcryptasm gas amd64) + process_symcryptasm(amd64/wipe.symcryptasm gas amd64) + + list(APPEND SOURCES_COMMON + amd64/aesasm-gas.asm + amd64/fdef_asm-gas.asm + amd64/fdef369_asm-gas.asm + amd64/fdef_mulx-gas.asm + amd64/wipe-gas.asm) + set_source_files_properties( + amd64/aesasm-gas.asm + amd64/fdef_asm-gas.asm + amd64/fdef369_asm-gas.asm + amd64/fdef_mulx-gas.asm + amd64/wipe-gas.asm + PROPERTY LANGUAGE ASM) endif() endif() diff --git a/lib/a_dispatch.c b/lib/a_dispatch.c index de4448e..a5db79d 100644 --- a/lib/a_dispatch.c +++ b/lib/a_dispatch.c @@ -22,7 +22,7 @@ const SYMCRYPT_MODULAR_FUNCTIONS g_SymCryptModFns[] = { SYMCRYPT_MOD_FUNCTIONS_FDEF_GENERIC, // Handles any type of modulus SYMCRYPT_MOD_FUNCTIONS_FDEF_MONTGOMERY, // Montgomery, only for odd parity-public moduli -#if SYMCRYPT_CPU_AMD64 && SYMCRYPT_MS_VC +#if SYMCRYPT_CPU_AMD64 SYMCRYPT_MOD_FUNCTIONS_FDEF369_MONTGOMERY, // optimized for 384 and 576-bit moduli SYMCRYPT_MOD_FUNCTIONS_FDEF_MONTGOMERY256, // Special faster code for 256-bit Montgomery moduli @@ -55,12 +55,12 @@ const UINT32 g_SymCryptModFnsMask = sizeof( g_SymCryptModFns ) - sizeof( g_SymCr // // Tweaking the selection & function tables allows different tradeoffs of performance vs codesize // -SYMCRYPT_MODULUS_TYPE_SELECTION_ENTRY SymCryptModulusTypeSelections[] = +SYMCRYPT_MODULUS_TYPE_SELECTION_ENTRY SymCryptModulusTypeSelections[] = { -#if SYMCRYPT_CPU_AMD64 && SYMCRYPT_MS_VC +#if SYMCRYPT_CPU_AMD64 // Mulx used for 257-512 and 577-... 
bits {('2M' << 16) + SymCryptModFntableMontgomery256, 0, 256, SYMCRYPT_MODULUS_FEATURE_MONTGOMERY }, - {('xM' << 16) + SymCryptModFntableMontgomeryMulx, SYMCRYPT_CPU_FEATURES_FOR_MULX, 512, SYMCRYPT_MODULUS_FEATURE_MONTGOMERY }, + {('xM' << 16) + SymCryptModFntableMontgomeryMulx, SYMCRYPT_CPU_FEATURES_FOR_MULX, 512, SYMCRYPT_MODULUS_FEATURE_MONTGOMERY }, {('9M' << 16) + SymCryptModFntable369Montgomery, 0, 384, SYMCRYPT_MODULUS_FEATURE_MONTGOMERY }, {('5M' << 16) + SymCryptModFntableMontgomery512, 0, 512, SYMCRYPT_MODULUS_FEATURE_MONTGOMERY }, {('9M' << 16) + SymCryptModFntable369Montgomery, 0, 576, SYMCRYPT_MODULUS_FEATURE_MONTGOMERY }, @@ -118,9 +118,9 @@ SymCryptSizeofIntFromDigits( UINT32 nDigits ) PSYMCRYPT_INT SYMCRYPT_CALL -SymCryptIntCreate( - _Out_writes_bytes_( cbBuffer ) PBYTE pbBuffer, - SIZE_T cbBuffer, +SymCryptIntCreate( + _Out_writes_bytes_( cbBuffer ) PBYTE pbBuffer, + SIZE_T cbBuffer, UINT32 nDigits ) { return SymCryptFdefIntCreate( pbBuffer, cbBuffer, nDigits ); @@ -138,8 +138,8 @@ SymCryptIntWipe( _Out_ PSYMCRYPT_INT piDst ) VOID SYMCRYPT_CALL -SymCryptIntCopy( - _In_ PCSYMCRYPT_INT piSrc, +SymCryptIntCopy( + _In_ PCSYMCRYPT_INT piSrc, _Out_ PSYMCRYPT_INT piDst ) { SymCryptFdefIntCopy( piSrc, piDst ); @@ -191,8 +191,8 @@ SymCryptIntDigitsizeOfObject( _In_ PCSYMCRYPT_INT piSrc ) SYMCRYPT_ERROR SYMCRYPT_CALL -SymCryptIntCopyMixedSize( - _In_ PCSYMCRYPT_INT piSrc, +SymCryptIntCopyMixedSize( + _In_ PCSYMCRYPT_INT piSrc, _Out_ PSYMCRYPT_INT piDst ) { return SymCryptFdefIntCopyMixedSize( piSrc, piDst ); @@ -207,8 +207,8 @@ SymCryptIntBitsizeOfValue( _In_ PCSYMCRYPT_INT piSrc ) VOID SYMCRYPT_CALL -SymCryptIntSetValueUint32( - UINT32 u32Src, +SymCryptIntSetValueUint32( + UINT32 u32Src, _Out_ PSYMCRYPT_INT piDst ) { SymCryptFdefIntSetValueUint32( u32Src, piDst ); @@ -216,8 +216,8 @@ SymCryptIntSetValueUint32( VOID SYMCRYPT_CALL -SymCryptIntSetValueUint64( - UINT64 u64Src, +SymCryptIntSetValueUint64( + UINT64 u64Src, _Out_ PSYMCRYPT_INT piDst ) { SymCryptFdefIntSetValueUint64( u64Src, piDst ); @@ -225,10 +225,10 @@ SymCryptIntSetValueUint64( SYMCRYPT_ERROR SYMCRYPT_CALL -SymCryptIntSetValue( - _In_reads_bytes_(cbSrc) PCBYTE pbSrc, - SIZE_T cbSrc, - SYMCRYPT_NUMBER_FORMAT format, +SymCryptIntSetValue( + _In_reads_bytes_(cbSrc) PCBYTE pbSrc, + SIZE_T cbSrc, + SYMCRYPT_NUMBER_FORMAT format, _Out_ PSYMCRYPT_INT piDst ) { return SymCryptFdefIntSetValue( pbSrc, cbSrc, format, piDst ); @@ -236,10 +236,10 @@ SymCryptIntSetValue( SYMCRYPT_ERROR SYMCRYPT_CALL -SymCryptIntGetValue( - _In_ PCSYMCRYPT_INT piSrc, - _Out_writes_bytes_( cbDst ) PBYTE pbDst, - SIZE_T cbDst, +SymCryptIntGetValue( + _In_ PCSYMCRYPT_INT piSrc, + _Out_writes_bytes_( cbDst ) PBYTE pbDst, + SIZE_T cbDst, SYMCRYPT_NUMBER_FORMAT format ) { return SymCryptFdefIntGetValue( piSrc, pbDst, cbDst, format ); @@ -496,9 +496,9 @@ SymCryptSizeofDivisorFromDigits( UINT32 nDigits ) PSYMCRYPT_DIVISOR SYMCRYPT_CALL -SymCryptDivisorCreate( - _Out_writes_bytes_( cbBuffer ) PBYTE pbBuffer, - SIZE_T cbBuffer, +SymCryptDivisorCreate( + _Out_writes_bytes_( cbBuffer ) PBYTE pbBuffer, + SIZE_T cbBuffer, UINT32 nDigits ) { return SymCryptFdefDivisorCreate( pbBuffer, cbBuffer, nDigits ); @@ -514,8 +514,8 @@ SymCryptDivisorWipe( _Out_ PSYMCRYPT_DIVISOR pdObj ) } VOID -SymCryptDivisorCopy( - _In_ PCSYMCRYPT_DIVISOR pdSrc, +SymCryptDivisorCopy( + _In_ PCSYMCRYPT_DIVISOR pdSrc, _Out_ PSYMCRYPT_DIVISOR pdDst ) { SymCryptFdefDivisorCopy( pdSrc, pdDst ); @@ -585,9 +585,9 @@ SymCryptSizeofModulusFromDigits( UINT32 nDigits ) PSYMCRYPT_MODULUS 
SYMCRYPT_CALL -SymCryptModulusCreate( - _Out_writes_bytes_( cbBuffer ) PBYTE pbBuffer, - SIZE_T cbBuffer, +SymCryptModulusCreate( + _Out_writes_bytes_( cbBuffer ) PBYTE pbBuffer, + SIZE_T cbBuffer, UINT32 nDigits ) { return SymCryptFdefModulusCreate( pbBuffer, cbBuffer, nDigits ); @@ -604,7 +604,7 @@ SymCryptModulusWipe( _Out_ PSYMCRYPT_MODULUS pmObj ) VOID SymCryptModulusCopy( - _In_ PCSYMCRYPT_MODULUS pmSrc, + _In_ PCSYMCRYPT_MODULUS pmSrc, _Out_ PSYMCRYPT_MODULUS pmDst ) { SymCryptFdefModulusCopy( pmSrc, pmDst ); @@ -626,8 +626,8 @@ SymCryptModElementAllocate( _In_ PCSYMCRYPT_MODULUS pmMod ) VOID SYMCRYPT_CALL -SymCryptModElementFree( - _In_ PCSYMCRYPT_MODULUS pmMod, +SymCryptModElementFree( + _In_ PCSYMCRYPT_MODULUS pmMod, _Out_ PSYMCRYPT_MODELEMENT peObj ) { SymCryptFdefModElementFree( pmMod, peObj ); @@ -642,9 +642,9 @@ SymCryptSizeofModElementFromModulus( PCSYMCRYPT_MODULUS pmMod ) PSYMCRYPT_MODELEMENT SYMCRYPT_CALL -SymCryptModElementCreate( - _Out_writes_bytes_( cbBuffer ) PBYTE pbBuffer, - SIZE_T cbBuffer, +SymCryptModElementCreate( + _Out_writes_bytes_( cbBuffer ) PBYTE pbBuffer, + SIZE_T cbBuffer, PCSYMCRYPT_MODULUS pmMod ) { return SymCryptFdefModElementCreate( pbBuffer, cbBuffer, pmMod ); @@ -660,9 +660,9 @@ SymCryptModElementWipe( } VOID -SymCryptModElementCopy( +SymCryptModElementCopy( _In_ PCSYMCRYPT_MODULUS pmMod, - _In_ PCSYMCRYPT_MODELEMENT peSrc, + _In_ PCSYMCRYPT_MODELEMENT peSrc, _Out_ PSYMCRYPT_MODELEMENT peDst ) { SymCryptFdefModElementCopy( pmMod, peSrc, peDst ); @@ -671,7 +671,7 @@ SymCryptModElementCopy( VOID SymCryptModElementMaskedCopy( _In_ PCSYMCRYPT_MODULUS pmMod, - _In_ PCSYMCRYPT_MODELEMENT peSrc, + _In_ PCSYMCRYPT_MODELEMENT peSrc, _Out_ PSYMCRYPT_MODELEMENT peDst, UINT32 mask ) { @@ -753,7 +753,7 @@ SymCryptModElementToInt( PCUINT32 pData; SYMCRYPT_ASSERT( piDst->nDigits >= pmMod->nDigits ); - + pData = SYMCRYPT_MOD_CALL( pmMod ) modPreGet( pmMod, peSrc, pbScratch, cbScratch ); SymCryptFdefModElementToIntGeneric( pmMod, pData, piDst, pbScratch, cbScratch ); @@ -762,17 +762,17 @@ SymCryptModElementToInt( SYMCRYPT_DISABLE_CFG SYMCRYPT_ERROR SYMCRYPT_CALL -SymCryptModElementSetValue( - _In_reads_bytes_( cbSrc ) PCBYTE pbSrc, - SIZE_T cbSrc, - SYMCRYPT_NUMBER_FORMAT format, +SymCryptModElementSetValue( + _In_reads_bytes_( cbSrc ) PCBYTE pbSrc, + SIZE_T cbSrc, + SYMCRYPT_NUMBER_FORMAT format, PCSYMCRYPT_MODULUS pmMod, _Out_ PSYMCRYPT_MODELEMENT peDst, _Out_writes_bytes_( cbScratch ) PBYTE pbScratch, SIZE_T cbScratch ) { SYMCRYPT_ERROR scError; - + scError = SymCryptFdefModElementSetValueGeneric( pbSrc, cbSrc, format, pmMod, peDst, pbScratch, cbScratch ); if( scError == SYMCRYPT_NO_ERROR ) @@ -785,11 +785,11 @@ SymCryptModElementSetValue( SYMCRYPT_ERROR SYMCRYPT_CALL -SymCryptModElementGetValue( +SymCryptModElementGetValue( PCSYMCRYPT_MODULUS pmMod, _In_ PCSYMCRYPT_MODELEMENT peSrc, - _Out_writes_bytes_( cbDst ) PBYTE pbDst, - SIZE_T cbDst, + _Out_writes_bytes_( cbDst ) PBYTE pbDst, + SIZE_T cbDst, SYMCRYPT_NUMBER_FORMAT format, _Out_writes_bytes_( cbScratch ) PBYTE pbScratch, SIZE_T cbScratch ) @@ -889,8 +889,8 @@ SymCryptModNeg( SYMCRYPT_DISABLE_CFG VOID SYMCRYPT_CALL -SymCryptModElementSetValueUint32( - UINT32 value, +SymCryptModElementSetValueUint32( + UINT32 value, _In_ PCSYMCRYPT_MODULUS pmMod, _Out_ PSYMCRYPT_MODELEMENT peDst, _Out_writes_bytes_( cbScratch ) PBYTE pbScratch, @@ -903,8 +903,8 @@ SymCryptModElementSetValueUint32( VOID SYMCRYPT_CALL -SymCryptModElementSetValueNegUint32( - UINT32 value, +SymCryptModElementSetValueNegUint32( + UINT32 value, 
_In_ PCSYMCRYPT_MODULUS pmMod, _Out_ PSYMCRYPT_MODELEMENT peDst, _Out_writes_bytes_( cbScratch ) PBYTE pbScratch, @@ -994,7 +994,7 @@ SymCryptCreateTrialDivisionContext( UINT32 nDigits ) UINT32 SYMCRYPT_CALL -SymCryptIntFindSmallDivisor( +SymCryptIntFindSmallDivisor( _In_ PCSYMCRYPT_TRIALDIVISION_CONTEXT pContext, _In_ PCSYMCRYPT_INT piSrc, _Out_writes_bytes_( cbScratch ) PBYTE pbScratch, diff --git a/lib/amd64/aesasm.asm b/lib/amd64/aesasm.asm deleted file mode 100644 index dbb820a..0000000 --- a/lib/amd64/aesasm.asm +++ /dev/null @@ -1,1657 +0,0 @@ -; -; AesAsm.asm Assembler code for fast AES on the amd64 -; -; Copyright (c) Microsoft Corporation. Licensed under the MIT license. -; -; This code is derived from the AesFast implemenation that -; Niels Ferguson wrote from scratch for BitLocker during Vista. -; That code is still in RSA32. -; - -include ksamd64.inc - -include symcrypt_version.inc -include symcrypt_magic.inc - - TITLE "Advanced Encryption Standard (AES)" - -USE_BLOCK_FUNCTION EQU 1 ; Set to 1 to use block function, 0 to use block macro - -; -; Structure definition that mirrors the SYMCRYPT_AES_EXPANDED_KEY structure. -; - -N_ROUND_KEYS_IN_AESKEY EQU 29 - -SYMCRYPT_AES_EXPANDED_KEY struct - RoundKey dq 2*N_ROUND_KEYS_IN_AESKEY dup (?) ; - lastEncRoundKey dq ? ; pointer to last enc round key - lastDecRoundKey dq ? ; pointer to last dec round key - - SYMCRYPT_MAGIC_FIELD - -SYMCRYPT_AES_EXPANDED_KEY ends - - - extern SymCryptAesSboxMatrixMult:DWORD - extern SymCryptAesInvSboxMatrixMult:DWORD -; extern SymCryptAesSbox:BYTE ; Not used - extern SymCryptAesInvSbox:BYTE - -; -; Shorthand for the 4 tables we will use -; We always use r11 to point to the (inv) SboxMatrixMult tables -; -SMM0 EQU r11 -SMM1 EQU r11 + 0400h -SMM2 EQU r11 + 0800h -SMM3 EQU r11 + 0c00h - -ISMM0 EQU r11 -ISMM1 EQU r11 + 0400h -ISMM2 EQU r11 + 0800h -ISMM3 EQU r11 + 0c00h - - - - -ENC_MIX MACRO keyptr - ; - ; Perform the unkeyed mixing function for encryption - ; plus a key addition from the key pointer - ; - ; input:block is in eax, ebx, ecx, edx; r11 points to AesSboxMatrixMult - ; New state ends up in eax, ebx, ecx, edx - ; Used registers: esi, edi, ebp, r8 - - ; - ; We can use the e registers for the movzx as the - ; upper 32 bits are automatically set to 0. This saves - ; prefix bytes - ; - ; We use 32-bit registers to store the state. - ; We tried using 64-bit registers, but the extra shifts - ; cost too much. - ; Using 32-bit throughout makes the key xor more expensive - ; but we avoid having to combine the 32-bit halves into - ; 64 bit. 
- ; - - movzx esi,al - mov esi,[SMM0 + 4 * rsi] - movzx edi,ah - shr eax,16 - mov r8d,[SMM1 + 4 * rdi] - movzx ebp,al - mov ebp,[SMM2 + 4 * rbp] - movzx edi,ah - mov edi,[SMM3 + 4 * rdi] - - movzx eax,bl - xor edi,[SMM0 + 4 * rax] - movzx eax,bh - shr ebx,16 - xor esi,[SMM1 + 4 * rax] - movzx eax,bl - xor r8d,[SMM2 + 4 * rax] - movzx eax,bh - xor ebp,[SMM3 + 4 * rax] - - movzx eax,cl - xor ebp,[SMM0 + 4 * rax] - movzx ebx,ch - shr ecx,16 - xor edi,[SMM1 + 4 * rbx] - movzx eax,cl - xor esi,[SMM2 + 4 * rax] - movzx ebx,ch - xor r8d,[SMM3 + 4 * rbx] - - movzx eax,dl - xor r8d,[SMM0 + 4 * rax] - movzx ebx,dh - shr edx,16 - xor ebp,[SMM1 + 4 * rbx] - movzx eax,dl - xor edi,[SMM2 + 4 * rax] - movzx ebx,dh - xor esi,[SMM3 + 4 * rbx] - - mov eax, [keyptr] - mov ebx, [keyptr + 4] - xor eax, esi - mov ecx, [keyptr + 8] - xor ebx, edi - mov edx, [keyptr + 12] - xor ecx, ebp - xor edx, r8d - - ENDM - - -DEC_MIX MACRO keyptr - ; - ; Perform the unkeyed mixing function for decryption - ; - ; input:block is in eax, ebx, ecx, edx - ; r11 points to AesInvSboxMatrixMult - ; New state ends up in esi, edi, ebp, r8d - - movzx esi,al - mov esi,[ISMM0 + 4 * rsi] - movzx edi,ah - shr eax,16 - mov edi,[ISMM1 + 4 * rdi] - movzx ebp,al - mov ebp,[ISMM2 + 4 * rbp] - movzx eax,ah - mov r8d,[ISMM3 + 4 * rax] - - movzx eax,bl - xor edi,[ISMM0 + 4 * rax] - movzx eax,bh - shr ebx,16 - xor ebp,[ISMM1 + 4 * rax] - movzx eax,bl - xor r8d,[ISMM2 + 4 * rax] - movzx eax,bh - xor esi,[ISMM3 + 4 * rax] - - movzx eax,cl - xor ebp,[ISMM0 + 4 * rax] - movzx ebx,ch - shr ecx,16 - xor r8d,[ISMM1 + 4 * rbx] - movzx eax,cl - xor esi,[ISMM2 + 4 * rax] - movzx ebx,ch - xor edi,[ISMM3 + 4 * rbx] - - movzx eax,dl - xor r8d,[ISMM0 + 4 * rax] - movzx ebx,dh - shr edx,16 - xor esi,[ISMM1 + 4 * rbx] - movzx eax,dl - xor edi,[ISMM2 + 4 * rax] - movzx ebx,dh - xor ebp,[ISMM3 + 4 * rbx] - - mov eax, [keyptr] - mov ebx, [keyptr + 4] - xor eax, esi - mov ecx, [keyptr + 8] - xor ebx, edi - mov edx, [keyptr + 12] - xor ecx, ebp - xor edx, r8d - - ENDM - - - -AES_ENCRYPT_MACRO MACRO - ; - ; Plaintext in eax, ebx, ecx, edx - ; r9 points to first round key to use (modified) - ; r10 is last key to use (unchanged) - ; r11 points to SboxMatrixMult (unchanged) - ; Ciphertext ends up in esi, edi, ebp, r8d - ; - ; This macro is free to unroll the cipher completely, or to use a loop - ; over r9 - ; - - ; - ; xor in first round key - ; - xor eax,[r9] - xor ebx,[r9+4] - xor ecx,[r9+8] - xor edx,[r9+12] - - ENC_MIX r9+16 - - ENC_MIX r9+32 - - ENC_MIX r9+48 - - ENC_MIX r9+64 - - ENC_MIX r9+80 - - ENC_MIX r9+96 - - add r9,160 - - ENC_MIX r9-48 - - ;align 16 - -@@: - ; Block is eax, ebx, ecx, edx - ; r9-16 points to next round key - - ENC_MIX r9-32 - - ENC_MIX r9-16 - - cmp r9,r10 - lea r9,[r9+32] - jc @B - - ; - ; Now for the final round - ; We use the fact that SboxMatrixMult[0] table is also - ; an Sbox table if you use the second element of each entry. 
- ; - ; Result is in esi, edi, ebp, r8d - ; - - movzx esi,al - movzx esi,byte ptr[r11 + 1 + 4*rsi] - movzx edi,ah - shr eax,16 - movzx r8d,byte ptr[r11 + 1 + 4*rdi] - movzx ebp,al - shl r8d,8 - movzx ebp,byte ptr[r11 + 1 + 4*rbp] - shl ebp,16 - movzx edi,ah - movzx edi,byte ptr[r11 + 1 + 4*rdi] - shl edi,24 - - movzx eax,bl - movzx eax,byte ptr[r11 + 1 + 4*rax] - or edi,eax - movzx eax,bh - shr ebx,16 - movzx eax,byte ptr[r11 + 1 + 4*rax] - shl eax,8 - or esi,eax - movzx eax,bl - movzx eax,byte ptr[r11 + 1 + 4*rax] - movzx ebx,bh - shl eax,16 - movzx ebx,byte ptr[r11 + 1 + 4*rbx] - or r8d,eax - shl ebx,24 - or ebp,ebx - - movzx eax,cl - movzx ebx,ch - movzx eax,byte ptr[r11 + 1 + 4*rax] - shr ecx,16 - movzx ebx,byte ptr[r11 + 1 + 4*rbx] - shl ebx,8 - or ebp,eax - or edi,ebx - movzx eax,cl - movzx eax,byte ptr[r11 + 1 + 4*rax] - movzx ebx,ch - movzx ebx,byte ptr[r11 + 1 + 4*rbx] - shl eax,16 - shl ebx,24 - or esi,eax - or r8d,ebx - - movzx eax,dl - movzx ebx,dh - movzx eax,byte ptr[r11 + 1 + 4*rax] - shr edx,16 - movzx ebx,byte ptr[r11 + 1 + 4*rbx] - shl ebx,8 - or r8d,eax - or ebp,ebx - movzx eax,dl - movzx eax,byte ptr[r11 + 1 + 4*rax] - movzx ebx,dh - movzx ebx,byte ptr[r11 + 1 + 4*rbx] - shl eax,16 - shl ebx,24 - or edi,eax - or esi,ebx - - ; - ; xor in final round key - ; - - xor r8d,[r10+12] - xor esi,[r10] - xor edi,[r10+4] - xor ebp,[r10+8] - - ENDM - -AES_DECRYPT_MACRO MACRO - ; - ; Ciphertext in eax, ebx, ecx, edx - ; r9 points to first round key to use - ; r10 is last key to use (unchanged) - ; r11 points to InvSboxMatrixMult (unchanged) - ; r12 points to InvSbox (unchanged) - ; Ciphertext ends up in esi, edi, ebp, r8d - ; - - - ; - ; xor in first round key - ; - xor eax,[r9] - xor ebx,[r9+4] - xor ecx,[r9+8] - xor edx,[r9+12] - - DEC_MIX r9+16 - - DEC_MIX r9+32 - - DEC_MIX r9+48 - - DEC_MIX r9+64 - - DEC_MIX r9+80 - - DEC_MIX r9+96 - - add r9,160 - - DEC_MIX r9-48 - - ;align 16 - -@@: - ; Block is eax, ebx, ecx, edx - ; r9-32 points to next round key - - DEC_MIX r9-32 - - DEC_MIX r9-16 - - cmp r9,r10 - lea r9,[r9+32] - jc @B - - ; - ; Now for the final round - ; Result is in esi, edi, ebp, r8d - ; - - movzx esi,al - movzx esi,byte ptr[r12 + rsi] - movzx edi,ah - shr eax,16 - movzx edi,byte ptr[r12 + rdi] - movzx ebp,al - shl edi,8 - movzx ebp,byte ptr[r12 + rbp] - shl ebp,16 - movzx eax,ah - movzx r8d,byte ptr[r12 + rax] - shl r8d,24 - - movzx eax,bl - movzx eax,byte ptr[r12 + rax] - or edi,eax - movzx eax,bh - shr ebx,16 - movzx eax,byte ptr[r12 + rax] - shl eax,8 - or ebp,eax - movzx eax,bl - movzx eax,byte ptr[r12 + rax] - movzx ebx,bh - shl eax,16 - movzx ebx,byte ptr[r12 + rbx] - or r8d,eax - shl ebx,24 - or esi,ebx - - movzx eax,cl - movzx ebx,ch - movzx eax,byte ptr[r12 + rax] - shr ecx,16 - movzx ebx,byte ptr[r12 + rbx] - shl ebx,8 - or ebp,eax - or r8d,ebx - movzx eax,cl - movzx eax,byte ptr[r12 + rax] - movzx ebx,ch - movzx ebx,byte ptr[r12 + rbx] - shl eax,16 - shl ebx,24 - or esi,eax - or edi,ebx - - movzx eax,dl - movzx ebx,dh - movzx eax,byte ptr[r12 + rax] - shr edx,16 - movzx ebx,byte ptr[r12 + rbx] - shl ebx,8 - or r8d,eax - or esi,ebx - movzx eax,dl - movzx eax,byte ptr[r12 + rax] - movzx ebx,dh - movzx ebx,byte ptr[r12 + rbx] - shl eax,16 - shl ebx,24 - or edi,eax - or ebp,ebx - - ; - ; xor in final round key - ; - - xor esi,[r10] - xor edi,[r10+4] - xor ebp,[r10+8] - xor r8d,[r10+12] - - ENDM - -if 0 -AES_ENCRYPT_XMM MACRO - ; xmm0 contains the plaintext - ; rcx points to first round key to use - ; r10 is last key to use (unchanged) - ; Ciphertext ends up in 
xmm0 - ; - - ; - ; xor in first round key; round keys are 16-aligned on amd64 - ; - pxor xmm0,[rcx] - aesenc xmm0,[rcx+16] - - aesenc xmm0,[rcx+32] - aesenc xmm0,[rcx+48] - aesenc xmm0,[rcx+64] - aesenc xmm0,[rcx+80] - aesenc xmm0,[rcx+96] - aesenc xmm0,[rcx+112] - add rcx, 128 - -@@: - ; r9 points to next round key - - aesenc xmm0, [rcx] - aesenc xmm0, [rcx+16] - - add rcx, 32 - cmp rcx,r10 - jc @B - - ; - ; Now for the final round - ; - aesenclast xmm0, [r10] - - ENDM - - -AES_DECRYPT_XMM MACRO - ; xmm0 contains the ciphertext - ; rcx points to first round key to use - ; r10 is last key to use (unchanged) - ; Plaintext ends up in xmm0 - ; - - ; - ; xor in first round key; round keys are 16-aligned on amd64 - ; - pxor xmm0,[rcx] - aesdec xmm0,[rcx+16] - - aesdec xmm0,[rcx+32] - aesdec xmm0,[rcx+48] - aesdec xmm0,[rcx+64] - aesdec xmm0,[rcx+80] - aesdec xmm0,[rcx+96] - aesdec xmm0,[rcx+112] - add rcx, 128 - -@@: - ; r9 points to next round key - - aesdec xmm0, [rcx] - aesdec xmm0, [rcx+16] - - add rcx, 32 - cmp rcx,r10 - jc @B - - ; - ; Now for the final round - ; - aesdeclast xmm0, [r10] - - - ENDM -endif - - IF USE_BLOCK_FUNCTION - - ; - ; We use a block function, the AES_ENCRYPT macro merely calls the function - ; - -AES_ENCRYPT MACRO - call SymCryptAesEncryptAsmInternal - ENDM - -AES_DECRYPT MACRO - call SYmCryptAesDecryptAsmInternal - ENDM - -;======================================== -; SymCryptAesEncryptAsmInternal -; -; Internal AES encryption routine with modified calling convention. -; This function has the exact same calling convention as the AES_ENCRYPT_MACRO - - - LEAF_ENTRY SymCryptAesEncryptAsmInternal, _TEXT - - AES_ENCRYPT_MACRO - - ret - - LEAF_END SymCryptAesEncryptAsmInternal, _TEXT - - -;======================================== -; SymCryptAesDecryptAsmInternal -; -; Internal AES encryption routine with modified calling convention. -; This function has the exact same calling convention as the AES_DECRYPT_MACRO -; - - - LEAF_ENTRY SymCryptAesDecryptAsmInternal, _TEXT - - AES_DECRYPT_MACRO - - ret - - LEAF_END SymCryptAesDecryptAsmInternal, _TEXT - - - ELSE - - ; - ; No block function, use the macro directly - ; - -AES_ENCRYPT MACRO - AES_ENCRYPT_MACRO - ENDM - -AES_DECRYPT MACRO - AES_DECRYPT_MACRO - ENDM - - ENDIF - - - -; -;VOID -;SYMCRYPT_CALL -;SymCryptAesEncrypt( _In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey, -; _In_reads_bytes_( SYMCRYPT_AES_BLOCK_LEN ) PCBYTE pbPlaintext, -; _Out_writes_bytes_( SYMCRYPT_AES_BLOCK_LEN ) PBYTE pbCiphertext ); -; - -SymCryptAesEncryptFrame struct - -SaveRdi dq ? -SaveRsi dq ? -SaveRbp dq ? -SaveRbx dq ? -ReturnAddress dq ? -CallerP1Home dq ? -CallerP2Home dq ? -CallerP3Home dq ? -CallerP4Home dq ? - -SymCryptAesEncryptFrame ends - - NESTED_ENTRY SymCryptAesEncryptAsm, _TEXT - - ; - ; Prologue - ; Pushes are as fast as stores and smaller, so we use those - ; - rex_push_reg rbx - push_reg rbp - push_reg rsi - push_reg rdi - END_PROLOGUE - - SYMCRYPT_CHECK_MAGIC rcx, SYMCRYPT_AES_EXPANDED_KEY - - ; - ; At this point the stack is not properly aligned, but as we only call our own internal function - ; with a modified calling convention this is not a problem. (Interrupt routines can deal with - ; unaligned stack, and the stack _will_ be aligned during the actual AES work.) 
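The SYMCRYPT_CHECK_MAGIC invocation above is a DBG-only integrity check on the expanded key; in the old file the macro comes from symcrypt_magic.inc, and it is redefined in the new aesasm.symcryptasm later in this diff. A rough C rendering of the check it performs, with hypothetical names and a simplified layout:

```c
#include <stdint.h>

/* Rough C rendering of the DBG-only SYMCRYPT_CHECK_MAGIC sequence
   (mov rax,[ptr+magic]; sub rax,ptr; cmp rax,constant): the magic field
   stores constant + address of the structure, so a key that was bytewise
   copied to another address, or corrupted, fails the check.
   EXAMPLE_* names and layout are illustrative only. */
typedef struct {
    uint8_t   roundKeys[29 * 16];
    void     *lastEncRoundKey;
    void     *lastDecRoundKey;
    uintptr_t magic;              /* MAGIC_CONSTANT + (uintptr_t)&structure */
} EXAMPLE_AES_EXPANDED_KEY;

static int exampleMagicOk(const EXAMPLE_AES_EXPANDED_KEY *pKey,
                          uintptr_t magicConstant)
{
    return (pKey->magic - (uintptr_t)pKey) == magicConstant;
}
```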
- ; - - - ; Parameters passed: - ; rcx = pExpandedKey - ; rdx = pbPlaintext - ; r8 = pbCiphertext - ; - - mov r10, [rcx + SYMCRYPT_AES_EXPANDED_KEY.lastEncRoundKey] - mov r9, rcx - - mov [rsp + SymCryptAesEncryptFrame.CallerP3Home], r8 - - ; - ; Load the plaintext - ; - mov eax,[rdx ] - mov ebx,[rdx + 4] - mov ecx,[rdx + 8] - mov edx,[rdx + 12] - - lea r11,[SymCryptAesSboxMatrixMult] - - AES_ENCRYPT - ; - ; Plaintext in eax, ebx, ecx, edx - ; r9 points to first round key to use - ; r10 is last key to use (unchanged) - ; r11 points to SboxMatrixMult (unchanged) - ; Ciphertext ends up in esi, edi, ebp, r8d - ; - - mov rdx,[rsp + SymCryptAesEncryptFrame.CallerP3Home] - mov [rdx ], esi - mov [rdx + 4], edi - mov [rdx + 8], ebp - mov [rdx + 12], r8d - -SymCryptAesEncryptAsmDone: - - BEGIN_EPILOGUE - - pop rdi - pop rsi - pop rbp - pop rbx - ret - - - NESTED_END SymCryptAesEncryptAsm, _TEXT - - -; -;VOID -;SYMCRYPT_CALL -;SymCryptAesDecrypt( _In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey, -; _In_reads_bytes_( SYMCRYPT_AES_BLOCK_LEN ) PCBYTE pbCiphertext, -; _Out_writes_bytes_( SYMCRYPT_AES_BLOCK_LEN ) PBYTE pbPlaintext ); -; - - - NESTED_ENTRY SymCryptAesDecryptAsm, _TEXT - -SymCryptAesDecryptFrame struct - -SaveR12 dq ? -SaveRdi dq ? -SaveRsi dq ? -SaveRbp dq ? -SaveRbx dq ? -ReturnAddress dq ? -pExpandedKeyHome dq ? -pbCiphertextHome dq ? -pbPlaintextHome dq ? -CallerP4Home dq ? - -SymCryptAesDecryptFrame ends - ; - ; Prologue - ; Pushes are as fast as stores and smaller, so we use those - ; - rex_push_reg rbx - push_reg rbp - push_reg rsi - push_reg rdi - push_reg r12 - END_PROLOGUE - - SYMCRYPT_CHECK_MAGIC rcx, SYMCRYPT_AES_EXPANDED_KEY - - ; - ; At this point the stack is not properly aligned, but as we only call our own internal function - ; with a modified calling convention this is not a problem. (Interrupt routines can deal with - ; unaligned stack, and the stack _will_ be aligned during the actual AES work.) - ; - - - ; Parameters passed: - ; rcx = pExpandedKey - ; rdx = pbCiphertext - ; r8 = pbPlaintext - ; - - mov r9,[rcx + SYMCRYPT_AES_EXPANDED_KEY.lastEncRoundKey] - mov r10,[rcx + SYMCRYPT_AES_EXPANDED_KEY.lastDecRoundKey] - - mov [rsp + SymCryptAesDecryptFrame.pbCiphertextHome], r8 - - mov eax,[rdx] - mov ebx,[rdx+4] - mov ecx,[rdx+8] - mov edx,[rdx+12] - - - lea r11,[SymCryptAesInvSboxMatrixMult] - lea r12,[SymCryptAesInvSbox] - - AES_DECRYPT - ; Ciphertext in eax, ebx, ecx, edx - ; r9 points to first round key to use - ; r10 is last key to use (unchanged) - ; r11 points to InvSboxMatrixMult (unchanged) - ; r12 points to InvSbox (unchanged) - ; Ciphertext ends up in esi, edi, ebp, r8d - - mov rdx,[rsp + SymCryptAesDecryptFrame.pbCiphertextHome] ; retrieve bpPlaintext - mov [rdx],esi - mov [rdx+4],edi - mov [rdx+8],ebp - mov [rdx+12],r8d - -SymCryptAesDecryptAsmDone: - - BEGIN_EPILOGUE - - pop r12 - pop rdi - pop rsi - pop rbp - pop rbx - ret - - NESTED_END SymCryptAesDecryptAsm, _TEXT - - - -;VOID -;SYMCRYPT_CALL -;SymCryptAesCbcEncrypt( -; _In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey, -; _In_reads_bytes_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbChainingValue, -; _In_reads_bytes_( cbData ) PCBYTE pbSrc, -; _Out_writes_bytes_( cbData ) PBYTE pbDst, -; SIZE_T cbData ); - - NESTED_ENTRY SymCryptAesCbcEncryptAsm, _TEXT - -AesCbcEncryptFrame struct - -SaveR15 dq ? -SaveR14 dq ? -SaveR13 dq ? -SaveR12 dq ? -SaveRdi dq ? -SaveRsi dq ? -SaveRbp dq ? -SaveRbx dq ? -ReturnAddress dq ? -CallerP1Home dq ? -CallerP2Home dq ? -CallerP3Home dq ? -CallerP4Home dq ? -cbData dq ? 
- -AesCbcEncryptFrame ends - - ; - ; rcx = pExpandedKey - ; rdx = pbChainingValue - ; r8 = pbSrc - ; r9 = pbDst - ; [rsp+28] = cbData - - rex_push_reg rbx - push_reg rbp - push_reg rsi - push_reg rdi - push_reg r12 - push_reg r13 - push_reg r14 - push_reg r15 - - END_PROLOGUE - - SYMCRYPT_CHECK_MAGIC rcx, SYMCRYPT_AES_EXPANDED_KEY - - mov r15,[rsp + AesCbcEncryptFrame.cbData] - - mov [rsp + AesCbcEncryptFrame.CallerP2Home], rdx ; save pbChainingValue - - mov r13, r8 ; r13 = pbSrc - - - and r15, NOT 15 - jz SymCryptAesCbcEncryptNoData - - add r15, r8 - - mov r14, r9 ; r14 = pbDst - - mov r10,[rcx + SYMCRYPT_AES_EXPANDED_KEY.lastEncRoundKey] ; r10 = last enc round key - - ; - ; Load the chaining state from pbChainingValue - ; - mov esi,[rdx] - mov edi,[rdx+4] - mov ebp,[rdx+8] - mov r8d,[rdx+12] - - - mov r12,rcx ; r12 = first round key to use - - lea r11,[SymCryptAesSboxMatrixMult] - - - align 16 -SymCryptAesCbcEncryptAsmLoop: - ; Loop register setup - ; r10 = last round key to use - ; r12 = first round key to use - ; r13 = pbSrc - ; r14 = pbDst - ; r15 = pbSrcEnd - - ; chaining state in (esi,edi,ebp,r8d) - - mov eax, [r13] - mov r9, r12 - mov ebx, [r13+4] - xor eax, esi - mov ecx, [r13+8] - xor ebx, edi - xor ecx, ebp - mov edx, [r13+12] - xor edx, r8d - - add r13, 16 - - - AES_ENCRYPT - ; - ; Plaintext in eax, ebx, ecx, edx - ; r9 points to first round key to use - ; r10 is last key to use (unchanged) - ; r11 points to SboxMatrixMult (unchanged) - ; Ciphertext ends up in esi, edi, ebp, r8d - ; - - mov [r14], esi - mov [r14+4], edi - mov [r14+8], ebp - mov [r14+12], r8d - - add r14, 16 - - cmp r13, r15 - - jb SymCryptAesCbcEncryptAsmLoop - - - ; - ; Update the chaining value - ; - mov rdx,[rsp + AesCbcEncryptFrame.CallerP2Home] - mov [rdx], esi - mov [rdx+4], edi - mov [rdx+8], ebp - mov [rdx+12], r8d - -SymCryptAesCbcEncryptNoData: -SymCryptAesCbcEncryptDone: - - BEGIN_EPILOGUE - - pop r15 - pop r14 - pop r13 - pop r12 - pop rdi - pop rsi - pop rbp - pop rbx - ret - - NESTED_END SymCryptAesCbcEncryptAsm, _TEXT - - - -;VOID -;SYMCRYPT_CALL -;SymCryptAesCbcDecrypt( -; _In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey, -; _In_reads_bytes_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbChainingValue, -; _In_reads_bytes_( cbData ) PCBYTE pbSrc, -; _Out_writes_bytes_( cbData ) PBYTE pbDst, -; SIZE_T cbData ); - - NESTED_ENTRY SymCryptAesCbcDecryptAsm, _TEXT - -AesCbcDecryptFrame struct - -SaveR15 dq ? -SaveR14 dq ? -SaveR13 dq ? -SaveR12 dq ? -SaveRdi dq ? -SaveRsi dq ? -SaveRbp dq ? -SaveRbx dq ? -ReturnAddress dq ? -CallerP1Home dq ? ;Tmp1 -CallerP2Home dq ? ;pbChainingValue -CallerP3Home dq ? ;pbSrc -CallerP4Home dq ? ;Tmp2 -cbData dq ? 
- -AesCbcDecryptFrame ends - - ; - ; rcx = pExpandedKey - ; rdx = pbChainingValue - ; r8 = pbSrc - ; r9 = pbDst - ; [rsp+28] = cbData - - rex_push_reg rbx - push_reg rbp - push_reg rsi - push_reg rdi - push_reg r12 - push_reg r13 - push_reg r14 - push_reg r15 - - END_PROLOGUE - - SYMCRYPT_CHECK_MAGIC rcx, SYMCRYPT_AES_EXPANDED_KEY - - mov r14,[rsp + AesCbcDecryptFrame.cbData] - - and r14, NOT 15 - jz SymCryptAesCbcDecryptNoData - - mov r13,[rcx + SYMCRYPT_AES_EXPANDED_KEY.lastEncRoundKey] - mov r10,[rcx + SYMCRYPT_AES_EXPANDED_KEY.lastDecRoundKey] - - mov [rsp + AesCbcDecryptFrame.CallerP2Home], rdx ;pbChainingValue - mov [rsp + AesCbcDecryptFrame.CallerP3Home], r8 ;pbSrc - sub r14, 16 - - lea r15,[r9 + r14] ; r15 = pbDst pointed to last block - add r14, r8 ; r14 = pbSrc pointed to last block - - lea r11,[SymCryptAesInvSboxMatrixMult] - lea r12,[SymCryptAesInvSbox] - - ; - ; Load last ciphertext block & save on stack (we need to put it in the pbChaining buffer later) - ; - mov eax,[r14] - mov ebx,[r14+4] - mov ecx,[r14+8] - mov edx,[r14+12] - - mov dword ptr [rsp + AesCbcDecryptFrame.CallerP1Home], eax - mov dword ptr [rsp + AesCbcDecryptFrame.CallerP1Home+4], ebx - mov dword ptr [rsp + AesCbcDecryptFrame.CallerP4Home], ecx - mov dword ptr [rsp + AesCbcDecryptFrame.CallerP4Home+4], edx - - jmp SymCryptAesCbcDecryptAsmLoopEntry - - align 16 - -SymCryptAesCbcDecryptAsmLoop: - ; Loop register setup - ; r13 = first round key to use - ; r14 = pbSrc - ; r15 = pbDst - ; [callerP3Home] = pbSrcStart - - ; current ciphertext block (esi,edi,ebp,r8d) - - mov eax,[r14-16] - mov ebx,[r14-12] - xor esi,eax - mov ecx,[r14-8] - xor edi,ebx - mov [r15],esi - mov edx,[r14-4] - xor ebp,ecx - mov [r15+4],edi - xor r8d,edx - mov [r15+8],ebp - mov [r15+12],r8d - - sub r14,16 - sub r15,16 - -SymCryptAesCbcDecryptAsmLoopEntry: - - mov r9, r13 - - AES_DECRYPT - ; - ; Ciphertext in eax, ebx, ecx, edx - ; r9 points to first round key to use - ; r10 is last key to use (unchanged) - ; r11 points to InvSboxMatrixMult (unchanged) - ; r12 points to InvSbox (unchanged) - ; Ciphertext ends up in esi, edi, ebp, r8d - ; - - cmp r14, [rsp + AesCbcDecryptFrame.CallerP3Home] ; pbSrc - ja SymCryptAesCbcDecryptAsmLoop - - mov rbx,[rsp + AesCbcDecryptFrame.CallerP2Home] ; pbChainingValue - xor esi,[rbx] - xor edi,[rbx+4] - xor ebp,[rbx+8] - xor r8d,[rbx+12] - - mov [r15], esi - mov [r15+4], edi - mov [r15+8], ebp - mov [r15+12], r8d - - ; - ; Update the chaining value to the last ciphertext block - ; - mov rax,[rsp + AesCbcDecryptFrame.CallerP1Home] - mov rcx,[rsp + AesCbcDecryptFrame.CallerP4Home] - mov [rbx], rax - mov [rbx+8], rcx - -SymCryptAesCbcDecryptNoData: - - BEGIN_EPILOGUE - - pop r15 - pop r14 - pop r13 - pop r12 - pop rdi - pop rsi - pop rbp - pop rbx - ret - - NESTED_END SymCryptAesCbcDecryptAsm, _TEXT - - - -;VOID -;SYMCRYPT_CALL -;SymCryptAesCtrMsb64( -; _In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey, -; _In_reads_bytes_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbChainingValue, -; _In_reads_bytes_( cbData ) PCBYTE pbSrc, -; _Out_writes_bytes_( cbData ) PBYTE pbDst, -; SIZE_T cbData ); - - NESTED_ENTRY SymCryptAesCtrMsb64Asm, _TEXT - -AesCtrMsb64Frame struct - -SaveR15 dq ? -SaveR14 dq ? -SaveR13 dq ? -SaveR12 dq ? -SaveRdi dq ? -SaveRsi dq ? -SaveRbp dq ? -SaveRbx dq ? -ReturnAddress dq ? -CallerP1Home dq ? -CallerP2Home dq ? -CallerP3Home dq ? ; used to store the first half of the chaining state -CallerP4Home dq ? ; used to store the second half of the chaining state -cbData dq ? 
- -AesCtrMsb64Frame ends - - ; - ; rcx = pExpandedKey - ; rdx = pbChainingValue - ; r8 = pbSrc - ; r9 = pbDst - ; [rsp+28] = cbData - - rex_push_reg rbx - push_reg rbp - push_reg rsi - push_reg rdi - push_reg r12 - push_reg r13 - push_reg r14 - push_reg r15 - - END_PROLOGUE - - SYMCRYPT_CHECK_MAGIC rcx, SYMCRYPT_AES_EXPANDED_KEY - - mov r14,[rsp + AesCtrMsb64Frame.cbData] - and r14, NOT 15 ; only deal with whole # blocks - jz SymCryptAesCtrMsb64NoData - - add r14, r8 ; cbData + pbSrc = pbSrcEnd - - mov [rsp + AesCtrMsb64Frame.CallerP2Home], rdx ; save pbChainingState - mov r12, rcx ; r12 = first round key to use - mov r10,[rcx + SYMCRYPT_AES_EXPANDED_KEY.lastEncRoundKey] ; r10 = last enc round key - - mov r13, r8 ; pbSrc - mov r15, r9 ; pbDst - - lea r11,[SymCryptAesSboxMatrixMult] - - ; - ; Load the chaining state - ; - mov rax, [rdx + 0] - mov rcx, [rdx + 8] - - ; - ; Store it in our local copy (we have no register free to keep pbChainingState in) - ; - mov [rsp + AesCtrMsb64Frame.CallerP3Home + 0], rax - mov [rsp + AesCtrMSb64Frame.CallerP3Home + 8], rcx - - ; - ; Move to the right registers - ; - mov rbx, rax - mov rdx, rcx - shr rbx, 32 - shr rdx, 32 - - align 16 -SymCryptAesCtrMsb64AsmLoop: - ; Loop invariant - ; Current chaining state is in (eax, ebx, ecx, edx) - ; r10 = last round key to use - ; r11 = SboxMatrixMult - ; r12 = first round key to use - ; r13 = pbSrc - ; r14 = pbSrcEnd - ; r15 = pbDst - ; [rsp + CallerP3Home] = 16 bytes chaining state block - - mov r9, r12 - - AES_ENCRYPT - ; - ; Plaintext in eax, ebx, ecx, edx - ; r9 points to first round key to use - ; r10 is last key to use (unchanged) - ; r11 points to SboxMatrixMult (unchanged) - ; Ciphertext ends up in esi, edi, ebp, r8d - ; - - ; To improve latency, we FIRST - ; load the chaining state, increment the counter, and write it back. - ; leave the state in the (eax, ebx, ecx, edx) registers - - mov eax,dword ptr [rsp + AesCtrMsb64Frame.CallerP3Home + 0] - mov ebx,dword ptr [rsp + AesCtrMsb64Frame.CallerP3Home + 4] - mov rcx,[rsp + AesCtrMsb64Frame.CallerP3Home + 8 ] - bswap rcx - add rcx, 1 - bswap rcx - mov [rsp + AesCtrMsb64Frame.CallerP3Home + 8], rcx - mov rdx, rcx - shr rdx, 32 - - ; THEN we process the XOR of the key stream with the data - ; This order is faster as we need to have the chaining state done - ; before we can proceed, but there are no dependencies on the data result - ; So we can loop back to the beginning while the data stream read/writes are - ; still in flight. 
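The bswap/add/bswap sequence above increments the big-endian 64-bit counter kept in the last 8 bytes of the chaining value. A portable C sketch of the same operation (the function name is illustrative, not SymCrypt's):

```c
#include <stdint.h>

/* Increment the big-endian (MSB-first) 64-bit counter in bytes 8..15 of the
   16-byte chaining value, as the bswap/add/bswap sequence above does. */
static void exampleCtrMsb64Increment(uint8_t chainingValue[16])
{
    uint64_t c = 0;
    for (int i = 8; i < 16; i++) {
        c = (c << 8) | chainingValue[i];    /* load, byte-reversing */
    }
    c++;                                    /* the 'add rcx, 1' */
    for (int i = 15; i >= 8; i--) {
        chainingValue[i] = (uint8_t)c;      /* store, byte-reversing back */
        c >>= 8;
    }
}
```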
- ; - ; xor with the source stream - - xor esi,[r13 + 0 ] - xor edi,[r13 + 4 ] - xor ebp,[r13 + 8 ] - xor r8d,[r13 + 12] - - ; store at the destination - - mov [r15 + 0], esi - mov [r15 + 4], edi - mov [r15 + 8], ebp - mov [r15 + 12], r8d - - add r13, 16 ; pbSrc += 16 - add r15, 16 ; pbDst += 16 - - cmp r13, r14 - - jb SymCryptAesCtrMsb64AsmLoop - - ; - ; Copy back the chaining value; we only modified the last 8 bytes, so that is all we copy - ; - mov rsi,[rsp + AesCtrMsb64Frame.CallerP2Home] ; pbChainingState - mov [rsi + 8], ecx - mov [rsi + 12], edx - - ; - ; Wipe the chaining value on stack - ; - xor rax, rax - mov [rsp + AesCtrMsb64Frame.CallerP3Home], rax - mov [rsp + AesCtrMsb64Frame.CallerP4Home], rax - -SymCryptAesCtrMsb64NoData: - - BEGIN_EPILOGUE - - pop r15 - pop r14 - pop r13 - pop r12 - pop rdi - pop rsi - pop rbp - pop rbx - ret - - NESTED_END SymCryptAesCtrMsb64Asm, _TEXT - - -if 0 - LEAF_ENTRY SymCryptAesEncryptXmm, _TEXT - ; - ; rcx = expanded key - ; rdx = pbSrc - ; r8 = pbDst - - SYMCRYPT_CHECK_MAGIC rcx, SYMCRYPT_AES_EXPANDED_KEY - - movups xmm0,[rdx] - mov r10, [rcx + SYMCRYPT_AES_EXPANDED_KEY.lastEncRoundKey] - - - AES_ENCRYPT_XMM - ; xmm0 contains the plaintext - ; rcx points to first round key to use - ; r10 is last key to use (unchanged) - - movups [r8],xmm0 - - ret - - LEAF_END SymCryptAesEncryptXmm, _TEXT -endif - -if 0 - - LEAF_ENTRY SymCryptAesDecryptXmm, _TEXT - ; - ; rcx = expanded key - ; rdx = pbSrc - ; r8 = pbDst - - SYMCRYPT_CHECK_MAGIC rcx, SYMCRYPT_AES_EXPANDED_KEY - - movups xmm0,[rdx] - mov r10, [rcx + SYMCRYPT_AES_EXPANDED_KEY.lastDecRoundKey] - mov rcx, [rcx + SYMCRYPT_AES_EXPANDED_KEY.lastEncRoundKey] - - - AES_DECRYPT_XMM - ; xmm0 contains the plaintext - ; rcx points to first round key to use - ; r10 is last key to use (unchanged) - - movups [r8],xmm0 - - ret - - LEAF_END SymCryptAesDecryptXmm, _TEXT -endif - -if 0 - - LEAF_ENTRY SymCryptAesCbcEncryptXmm, _TEXT -;VOID -;SYMCRYPT_CALL -;SymCryptAesCbcEncrypt( -; _In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey, -; _In_reads_bytes_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbChainingValue, -; _In_reads_bytes_( cbData ) PCBYTE pbSrc, -; _Out_writes_bytes_( cbData ) PBYTE pbDst, -; SIZE_T cbData ); - -SymCryptAesCbcEncryptXmmFrame struct - -ReturnAddress dq ? -CallerP1Home dq ? -CallerP2Home dq ? -CallerP3Home dq ? -CallerP4Home dq ? -cbData dq ? - -SymCryptAesCbcEncryptXmmFrame ends - - ; rcx = expanded key - ; rdx = pbChainingValue - ; r8 = pbSrc - ; r9 = pbDst - - SYMCRYPT_CHECK_MAGIC rcx, SYMCRYPT_AES_EXPANDED_KEY - - mov rax,[rsp + SymCryptAesCbcEncryptXmmFrame.cbData] - mov r11,rcx ; first round key - and rax, NOT 15 - jz SymCryptAesCbcEncryptXmmDone - - ; [rsp + 40] = cbData - - mov r10, [rcx + SYMCRYPT_AES_EXPANDED_KEY.lastEncRoundKey] - - add rax, r8 ; rax = pbSrcEnd - - movups xmm0,[rdx] - -SymCryptAesCbcEncryptAsmXmmLoop: - movups xmm1,[r8] - add r8,16 - - pxor xmm0,xmm1 - - mov rcx, r11 - - AES_ENCRYPT_XMM - ; xmm0 contains the plaintext - ; rcx points to first round key to use - ; r10 is last key to use (unchanged) - ; Ciphertext ends up in xmm0 - - movups [r9],xmm0 - add r9, 16 - cmp r8, rax - jb SymCryptAesCbcEncryptAsmXmmLoop - - movups [rdx],xmm0 - -SymCryptAesCbcEncryptXmmDone: - - ret - - LEAF_END SymCryptAesCbcEncryptXmm, _TEXT - -endif - -if 0 ; Replaced with C code using intrinics. 
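The C replacement this `if 0` comment refers to uses AES-NI intrinsics. A single-block sketch of the aesdec pattern the disabled routines below implement (simplified signature and key layout, not SymCrypt's actual C code):

```c
#include <wmmintrin.h>  /* AES-NI intrinsics; build with -maes on GCC/Clang */

/* Single-block sketch of the aesdec pattern used below: xor in the first
   round key, aesdec with each middle key, aesdeclast with the last key.
   The key layout and signature are simplified for illustration. */
static __m128i exampleAesDecryptBlock(const __m128i *pFirstKey,
                                      const __m128i *pLastKey,
                                      __m128i block)
{
    block = _mm_xor_si128(block, *pFirstKey++);
    while (pFirstKey < pLastKey) {
        block = _mm_aesdec_si128(block, *pFirstKey++);
    }
    return _mm_aesdeclast_si128(block, *pLastKey);
}
```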
- - LEAF_ENTRY SymCryptAesDecryptXmm4, _TEXT - ; decrypt xmm0-3 - ; Registers used: xmm4 - ; rcx = first key, r10 = last key - ; rcx is destroyed - - movaps xmm4,[rcx] - lea rcx, [rcx+16] - pxor xmm0, xmm4 - pxor xmm1, xmm4 - pxor xmm2, xmm4 - pxor xmm3, xmm4 - -@@: movaps xmm4,[rcx] - add rcx,16 - aesdec xmm0, xmm4 - aesdec xmm1, xmm4 - aesdec xmm2, xmm4 - aesdec xmm3, xmm4 - - cmp rcx, r10 - jc @B - - movaps xmm4,[r10] - - aesdeclast xmm0, xmm4 - aesdeclast xmm1, xmm4 - aesdeclast xmm2, xmm4 - aesdeclast xmm3, xmm4 - - ret - - LEAF_END SymCryptAesDecryptXmm4, _TEXT - - - - NESTED_ENTRY SymCryptAesCbcDecryptXmm, _TEXT -;VOID -;SYMCRYPT_CALL -;SymCryptAesCbcDecrypt( -; _In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey, -; _In_reads_bytes_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbChainingValue, -; _In_reads_bytes_( cbData ) PCBYTE pbSrc, -; _Out_writes_bytes_( cbData ) PBYTE pbDst, -; SIZE_T cbData ) - -SymCryptAesCbcDecryptXmmFrame struct - -SaveXmm9 dq 2 dup (?) -SaveXmm8 dq 2 dup (?) -SaveXmm7 dq 2 dup (?) -SaveXmm6 dq 2 dup (?) -SaveRbx dq ? -ReturnAddress dq ? -CallerP1Home dq ? -CallerP2Home dq ? -CallerP3Home dq ? -CallerP4Home dq ? -cbData dq ? - -SymCryptAesCbcDecryptXmmFrame ends - - - rex_push_reg rbx - alloc_stack SymCryptAesCbcDecryptXmmFrame.SaveRbx - save_xmm128 xmm6, oword ptr SymCryptAesCbcDecryptXmmFrame.SaveXmm6 - save_xmm128 xmm7, oword ptr SymCryptAesCbcDecryptXmmFrame.SaveXmm7 - save_xmm128 xmm8, oword ptr SymCryptAesCbcDecryptXmmFrame.SaveXmm8 - save_xmm128 xmm9, oword ptr SymCryptAesCbcDecryptXmmFrame.SaveXmm9 - - END_PROLOGUE - - SYMCRYPT_CHECK_MAGIC rcx, SYMCRYPT_AES_EXPANDED_KEY - - ; rcx = key - ; rdx = chaining value - ; r8 = pbSrc - ; r9 = pbDst - ; [rsp + cbData] = cbData - - mov rbx,[rsp + SymCryptAesCbcDecryptXmmFrame.cbData] - and rbx, NOT 15 - jz SymCryptAesCbcDecryptXmmNoData - - - xor rax, rax ; offset into buffers - - mov r10, [rcx + SYMCRYPT_AES_EXPANDED_KEY.lastDecRoundKey] - mov r11, [rcx + SYMCRYPT_AES_EXPANDED_KEY.lastEncRoundKey] - - movups xmm5, [rdx] ; load IV - - - sub rbx, 64 ; cbData - 64 - jc SymCryptAesCbcDecryptXmmPartial - -SymCryptAesCbcDecryptXmm4Loop: - ; - ; xmm5 = IV - ; r8 = pbSrc - ; r9 = pbDst - ; rax = offset into buffer; we will process bytes rax..rax+63 in this iteration - ; rbx = cbData - 64 - ; rax <= rbx - ; - - movups xmm0,[r8 + rax] - movups xmm1,[r8 + rax + 16] - movaps xmm6, xmm0 - movups xmm2,[r8 + rax + 32] - movaps xmm7, xmm1 - movups xmm3,[r8 + rax + 48] - movaps xmm8, xmm2 - movaps xmm9, xmm3 - - mov rcx, r11 - call SymCryptAesDecryptXmm4 ; decrypt xmm0-3 using xmm4. 
rcx = first key, r10 = last key - - pxor xmm0, xmm5 - pxor xmm1, xmm6 - movups [r9 + rax], xmm0 - pxor xmm2, xmm7 - movups [r9 + rax + 16], xmm1 - pxor xmm3, xmm8 - movups [r9 + rax + 32], xmm2 - movups [r9 + rax + 48], xmm3 - - movaps xmm5, xmm9 - - add rax,64 - cmp rax,rbx - jbe SymCryptAesCbcDecryptXmm4Loop - - test rbx,63 - jz SymCryptAesCbcDecryptXmmDone ; cbData was a multiple of 64, no partial block - - sub rbx, rax ; rbx = bytes left - 64 - -SymCryptAesCbcDecryptXmmPartial: - ; - ; r8 = pbSrc - ; r9 = pbDst - ; rax = current offset - ; rbx = # bytes left to do - 64, # bytes left is nonzero - ; - - movups xmm0,[r8+rax] - movaps xmm6,xmm0 - cmp rbx,16 - 64 - jz SymCryptAesCbcDecryptXmmPartialLoadDone - - movups xmm1,[r8+rax+16] - movaps xmm7,xmm1 - cmp rbx,32 - 64 - jz SymCRyptAesCbcDecryptXmmPartialLoadDone - - movups xmm2,[r8+rax+32] - movaps xmm8, xmm2 - -SymCryptAesCbcDecryptXmmPartialLoadDone: - - mov rcx,r11 - call SymCryptAesDecryptXmm4 - - pxor xmm0, xmm5 - movups [r9 + rax], xmm0 - movaps xmm5, xmm6 - cmp rbx, 16 - 64 - jz SymCryptAesCbcDecryptXmmDone - - pxor xmm1, xmm6 - movups [r9 + rax + 16], xmm1 - movaps xmm5, xmm7 - cmp rbx, 32 - 64 - jz SymCryptAesCbcDecryptXmmDone - - pxor xmm2, xmm7 - movups [r9 + rax + 32], xmm2 - movaps xmm5, xmm8 - -SymCryptAesCbcDecryptXmmDone: - movups [rdx], xmm5 - -SymCryptAesCbcDecryptXmmNoData: - - movaps xmm6, oword ptr [rsp + SymCryptAesCbcDecryptXmmFrame.SaveXmm6] - movaps xmm7, oword ptr [rsp + SymCryptAesCbcDecryptXmmFrame.SaveXmm7] - movaps xmm8, oword ptr [rsp + SymCryptAesCbcDecryptXmmFrame.SaveXmm8] - movaps xmm9, oword ptr [rsp + SymCryptAesCbcDecryptXmmFrame.SaveXmm9] - - add rsp,SymCryptAesCbcDecryptXmmFrame.SaveRbx - - BEGIN_EPILOGUE - - pop rbx - ret - - NESTED_END SymCryptAesCbcDecryptXmm, _TEXT - -endif - -if 0 ; No longer used; replaced with C code using intrinsics that can be inlined. -; -;VOID -;SymCryptAes4SboxXmm( _In_reads_bytes_(4) PCBYTE pIn, _Out_writes_bytes_(4) PBYTE pOut ); -; - LEAF_ENTRY SymCryptAes4SboxXmm, _TEXT - ; - ;rcx points to source - ;rdx points to destination - ; - ;We only use volatile registers so we do not have to save any registers. - ; - - mov eax,[rcx] ; Use a register to avoid alignment issues - movd xmm0, eax - - movsldup xmm0, xmm0 ; copy [31:0] to [63:32] - aeskeygenassist xmm0, xmm0, 0 - - movd eax, xmm0 - mov [rdx], eax - - ret - - LEAF_END SymCryptAes4SboxXmm, _TEXT - - -; -;VOID -;AesCreateDecryptionRoundKeyXmm( _In_reads_bytes_(16) PCBYTE pEncryptionRoundKey, -; _Out_writes_bytes_(16) PBYTE pDecryptionRoundKey ); -; - LEAF_ENTRY SymCryptAesCreateDecryptionRoundKeyXmm, _TEXT - ;rcx points to source - ;rdx points to destination - - movups xmm0,[rcx] - aesimc xmm0, xmm0 - movups [rdx], xmm0 - ret - - LEAF_END SymCryptAesCreateDecryptionRoundKeyXmm, _TEXT - -endif - - end - diff --git a/lib/amd64/aesasm.symcryptasm b/lib/amd64/aesasm.symcryptasm new file mode 100644 index 0000000..7c7eded --- /dev/null +++ b/lib/amd64/aesasm.symcryptasm @@ -0,0 +1,964 @@ +// +// aesasm.symcryptasm Assembler code for fast AES on the amd64 +// Expresses asm in a generic enough way to enable generation of MASM and GAS using the +// symcryptasm_processor.py script and C preprocessor +// +// Copyright (c) Microsoft Corporation. Licensed under the MIT license. +// +// This code is derived from the AesFast implementation that +// Niels Ferguson wrote from scratch for BitLocker during Vista. +// That code is still in RSA32. 
+// + +// This file has only been partially translated into symcryptasm; external function calls use the +// generic symcryptasm registers to map different calling conventions onto the fixed register +// layout used in aesasm. It seems likely that changing which registers the AES state is kept in within +// the macros could impact performance. +// In general we don't want to touch this code going forward; the vast majority of amd64 CPUs have aesni +// and use the Xmm Aes codepaths. + +#include "symcryptasm_shared.cppasm" + +#include "symcrypt_version.inc" + +#define USE_BLOCK_FUNCTION 1 // Set to 1 to use block function, 0 to use block macro + +#if defined(SYMCRYPT_MASM) +extern SymCryptAesSboxMatrixMult:DWORD +extern SymCryptAesInvSboxMatrixMult:DWORD +extern SymCryptAesInvSbox:BYTE +extern SymCryptFatal:NEAR + +#elif defined(SYMCRYPT_GAS) + +#else +#error Unknown target assembly +#endif + +#if DBG +SET(SYMCRYPT_CODE_VERSION, ((SYMCRYPT_CODE_VERSION_API SHL 16) OR SYMCRYPT_CODE_VERSION_MINOR )) +SET(SYMCRYPT_MAGIC_CONSTANT, (HEX(53316D76) + SYMCRYPT_CODE_VERSION)) // 0x53316D76 == 'S1mv' + +MACRO_START(SYMCRYPT_CHECK_MAGIC, check_magic_label, ptr, struct_magic_offset, arg_1) + mov rax, [ptr + struct_magic_offset] + sub rax, ptr + cmp rax, SYMCRYPT_MAGIC_CONSTANT + jz check_magic_label + mov arg_1, HEX(6D616763) // 0x6D616763 == 'magc' + call SymCryptFatal +check_magic_label: +MACRO_END() +#else +MACRO_START(SYMCRYPT_CHECK_MAGIC, check_magic_label, ptr, struct_magic_offset, arg_1) +MACRO_END() +#endif + +// +// Structure definition that mirrors the SYMCRYPT_AES_EXPANDED_KEY structure. +// + +// SYMCRYPT_AES_EXPANDED_KEY struct +// RoundKey dq 2*N_ROUND_KEYS_IN_AESKEY dup (?) // +// lastEncRoundKey dq ? // pointer to last enc round key +// lastDecRoundKey dq ? // pointer to last dec round key +// SYMCRYPT_MAGIC_FIELD +// SYMCRYPT_AES_EXPANDED_KEY ends + +SET(N_ROUND_KEYS_IN_AESKEY, 29) +SET(lastEncRoundKeyOffset, (29*16)) +SET(lastDecRoundKeyOffset, (29*16 + 8)) +SET(magicFieldOffset, (29*16 + 8 + 8)) + +// +// Shorthand for the 4 tables we will use +// We always use r11 to point to the (inv) SboxMatrixMult tables +// +#define SMM0 (r11 + 0) +#define SMM1 (r11 + 1024) +#define SMM2 (r11 + 2048) +#define SMM3 (r11 + 3072) + +#define ISMM0 (r11 + 0) +#define ISMM1 (r11 + 1024) +#define ISMM2 (r11 + 2048) +#define ISMM3 (r11 + 3072) + +MACRO_START(ENC_MIX, keyptr) + // + // Perform the unkeyed mixing function for encryption + // plus a key addition from the key pointer + // + // input:block is in eax, ebx, ecx, edx - r11 points to AesSboxMatrixMult + // New state ends up in eax, ebx, ecx, edx + // Used registers: esi, edi, ebp, r8 + + // + // We can use the e registers for the movzx as the + // upper 32 bits are automatically set to 0. This saves + // prefix bytes + // + // We use 32-bit registers to store the state. + // We tried using 64-bit registers, but the extra shifts + // cost too much. + // Using 32-bit throughout makes the key xor more expensive + // but we avoid having to combine the 32-bit halves into + // 64 bit.
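In C terms, the ENC_MIX macro computes one keyed T-table AES round. A hedged sketch, with the four 1KB SboxMatrixMult tables abstracted as the parameter T and s[0..3] standing in for the state kept in eax, ebx, ecx, edx (illustrative names, not SymCrypt code):

```c
#include <stdint.h>

/* One keyed T-table AES round, as ENC_MIX computes it. T abstracts the four
   1KB SboxMatrixMult tables; s[0..3] is the state kept in eax,ebx,ecx,edx. */
static void exampleEncMix(const uint32_t T[4][256], uint32_t s[4],
                          const uint32_t k[4])
{
    uint32_t n0, n1, n2, n3;

    n0 = T[0][s[0] & 0xff] ^ T[1][(s[1] >> 8) & 0xff] ^
         T[2][(s[2] >> 16) & 0xff] ^ T[3][s[3] >> 24];
    n1 = T[0][s[1] & 0xff] ^ T[1][(s[2] >> 8) & 0xff] ^
         T[2][(s[3] >> 16) & 0xff] ^ T[3][s[0] >> 24];
    n2 = T[0][s[2] & 0xff] ^ T[1][(s[3] >> 8) & 0xff] ^
         T[2][(s[0] >> 16) & 0xff] ^ T[3][s[1] >> 24];
    n3 = T[0][s[3] & 0xff] ^ T[1][(s[0] >> 8) & 0xff] ^
         T[2][(s[1] >> 16) & 0xff] ^ T[3][s[2] >> 24];

    s[0] = n0 ^ k[0];
    s[1] = n1 ^ k[1];
    s[2] = n2 ^ k[2];
    s[3] = n3 ^ k[3];
}
```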
+ //
+
+ movzx esi,al
+ mov esi,[SMM0 + 4 * rsi]
+ movzx edi,ah
+ shr eax,16
+ mov r8d,[SMM1 + 4 * rdi]
+ movzx ebp,al
+ mov ebp,[SMM2 + 4 * rbp]
+ movzx edi,ah
+ mov edi,[SMM3 + 4 * rdi]
+
+ movzx eax,bl
+ xor edi,[SMM0 + 4 * rax]
+ movzx eax,bh
+ shr ebx,16
+ xor esi,[SMM1 + 4 * rax]
+ movzx eax,bl
+ xor r8d,[SMM2 + 4 * rax]
+ movzx eax,bh
+ xor ebp,[SMM3 + 4 * rax]
+
+ movzx eax,cl
+ xor ebp,[SMM0 + 4 * rax]
+ movzx ebx,ch
+ shr ecx,16
+ xor edi,[SMM1 + 4 * rbx]
+ movzx eax,cl
+ xor esi,[SMM2 + 4 * rax]
+ movzx ebx,ch
+ xor r8d,[SMM3 + 4 * rbx]
+
+ movzx eax,dl
+ xor r8d,[SMM0 + 4 * rax]
+ movzx ebx,dh
+ shr edx,16
+ xor ebp,[SMM1 + 4 * rbx]
+ movzx eax,dl
+ xor edi,[SMM2 + 4 * rax]
+ movzx ebx,dh
+ xor esi,[SMM3 + 4 * rbx]
+
+ mov eax, [keyptr]
+ mov ebx, [keyptr + 4]
+ xor eax, esi
+ mov ecx, [keyptr + 8]
+ xor ebx, edi
+ mov edx, [keyptr + 12]
+ xor ecx, ebp
+ xor edx, r8d
+MACRO_END()
+
+
+MACRO_START(DEC_MIX, keyptr)
+ //
+ // Perform the unkeyed mixing function for decryption
+ // plus a key addition from the key pointer
+ //
+ // input: block is in eax, ebx, ecx, edx
+ // r11 points to AesInvSboxMatrixMult
+ // New state ends up in eax, ebx, ecx, edx
+ // Used registers: esi, edi, ebp, r8
+
+ movzx esi,al
+ mov esi,[ISMM0 + 4 * rsi]
+ movzx edi,ah
+ shr eax,16
+ mov edi,[ISMM1 + 4 * rdi]
+ movzx ebp,al
+ mov ebp,[ISMM2 + 4 * rbp]
+ movzx eax,ah
+ mov r8d,[ISMM3 + 4 * rax]
+
+ movzx eax,bl
+ xor edi,[ISMM0 + 4 * rax]
+ movzx eax,bh
+ shr ebx,16
+ xor ebp,[ISMM1 + 4 * rax]
+ movzx eax,bl
+ xor r8d,[ISMM2 + 4 * rax]
+ movzx eax,bh
+ xor esi,[ISMM3 + 4 * rax]
+
+ movzx eax,cl
+ xor ebp,[ISMM0 + 4 * rax]
+ movzx ebx,ch
+ shr ecx,16
+ xor r8d,[ISMM1 + 4 * rbx]
+ movzx eax,cl
+ xor esi,[ISMM2 + 4 * rax]
+ movzx ebx,ch
+ xor edi,[ISMM3 + 4 * rbx]
+
+ movzx eax,dl
+ xor r8d,[ISMM0 + 4 * rax]
+ movzx ebx,dh
+ shr edx,16
+ xor esi,[ISMM1 + 4 * rbx]
+ movzx eax,dl
+ xor edi,[ISMM2 + 4 * rax]
+ movzx ebx,dh
+ xor ebp,[ISMM3 + 4 * rbx]
+
+ mov eax, [keyptr]
+ mov ebx, [keyptr + 4]
+ xor eax, esi
+ mov ecx, [keyptr + 8]
+ xor ebx, edi
+ mov edx, [keyptr + 12]
+ xor ecx, ebp
+ xor edx, r8d
+MACRO_END()
+
+MACRO_START(AES_ENCRYPT_MACRO, AesEncryptMacroLoopLabel)
+ //
+ // Plaintext in eax, ebx, ecx, edx
+ // r9 points to first round key to use (modified)
+ // r10 is last key to use (unchanged)
+ // r11 points to SboxMatrixMult (unchanged)
+ // Ciphertext ends up in esi, edi, ebp, r8d
+ //
+ // This macro is free to unroll the cipher completely, or to use a loop
+ // over r9
+ //
+
+ //
+ // xor in first round key
+ //
+ xor eax,[r9]
+ xor ebx,[r9+4]
+ xor ecx,[r9+8]
+ xor edx,[r9+12]
+
+ add r9,32
+
+ // Do not unroll the loop at all because very few CPUs use this codepath so it's worth
+ // minimizing the binary size
+
+AesEncryptMacroLoopLabel:
+ // Block is eax, ebx, ecx, edx
+ // r9-16 points to next round key
+
+ ENC_MIX r9-16
+
+ cmp r9,r10
+ lea r9,[r9+16]
+ jc AesEncryptMacroLoopLabel
+
+ //
+ // Now for the final round
+ // We use the fact that SboxMatrixMult[0] table is also
+ // an Sbox table if you use the second element of each entry.
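[Editor's note] A hedged illustration of the trick described in the comment just above: on a little-endian machine, byte 1 of each 32-bit SboxMatrixMult entry is the plain S-box output, which is what the final round's 'movzx esi, byte ptr [r11 + 1 + 4*rsi]' loads. In C (T0 is a hypothetical stand-in for the first 1KB table):

    #include <stdint.h>

    extern const uint32_t T0[256];   // stands in for SymCryptAesSboxMatrixMult[0..255]

    static uint8_t SboxViaMatrixMult( uint8_t x )
    {
        return (uint8_t)( T0[x] >> 8 );   // the byte at offset 1 of entry x
    }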
+ // + // Result is in esi, edi, ebp, r8d + // + + movzx esi,al + movzx esi,byte ptr[r11 + 1 + 4*rsi] + movzx edi,ah + shr eax,16 + movzx r8d,byte ptr[r11 + 1 + 4*rdi] + movzx ebp,al + shl r8d,8 + movzx ebp,byte ptr[r11 + 1 + 4*rbp] + shl ebp,16 + movzx edi,ah + movzx edi,byte ptr[r11 + 1 + 4*rdi] + shl edi,24 + + movzx eax,bl + movzx eax,byte ptr[r11 + 1 + 4*rax] + or edi,eax + movzx eax,bh + shr ebx,16 + movzx eax,byte ptr[r11 + 1 + 4*rax] + shl eax,8 + or esi,eax + movzx eax,bl + movzx eax,byte ptr[r11 + 1 + 4*rax] + movzx ebx,bh + shl eax,16 + movzx ebx,byte ptr[r11 + 1 + 4*rbx] + or r8d,eax + shl ebx,24 + or ebp,ebx + + movzx eax,cl + movzx ebx,ch + movzx eax,byte ptr[r11 + 1 + 4*rax] + shr ecx,16 + movzx ebx,byte ptr[r11 + 1 + 4*rbx] + shl ebx,8 + or ebp,eax + or edi,ebx + movzx eax,cl + movzx eax,byte ptr[r11 + 1 + 4*rax] + movzx ebx,ch + movzx ebx,byte ptr[r11 + 1 + 4*rbx] + shl eax,16 + shl ebx,24 + or esi,eax + or r8d,ebx + + movzx eax,dl + movzx ebx,dh + movzx eax,byte ptr[r11 + 1 + 4*rax] + shr edx,16 + movzx ebx,byte ptr[r11 + 1 + 4*rbx] + shl ebx,8 + or r8d,eax + or ebp,ebx + movzx eax,dl + movzx eax,byte ptr[r11 + 1 + 4*rax] + movzx ebx,dh + movzx ebx,byte ptr[r11 + 1 + 4*rbx] + shl eax,16 + shl ebx,24 + or edi,eax + or esi,ebx + + // + // xor in final round key + // + + xor r8d,[r10+12] + xor esi,[r10] + xor edi,[r10+4] + xor ebp,[r10+8] +MACRO_END() + +MACRO_START(AES_DECRYPT_MACRO, AesDecryptMacroLoopLabel) + // + // Ciphertext in eax, ebx, ecx, edx + // r9 points to first round key to use + // r10 is last key to use (unchanged) + // r11 points to InvSboxMatrixMult (unchanged) + // r12 points to InvSbox (unchanged) + // Ciphertext ends up in esi, edi, ebp, r8d + // + + // + // xor in first round key + // + xor eax,[r9] + xor ebx,[r9+4] + xor ecx,[r9+8] + xor edx,[r9+12] + + add r9,32 + + // Do not unroll the loop at all because very few CPUs use this codepath so it's worth + // minimizing the binary size +AesDecryptMacroLoopLabel: + // Block is eax, ebx, ecx, edx + // r9-16 points to next round key + + DEC_MIX r9-16 + + cmp r9,r10 + lea r9,[r9+16] + jc AesDecryptMacroLoopLabel + + // + // Now for the final round + // Result is in esi, edi, ebp, r8d + // + + movzx esi,al + movzx esi,byte ptr[r12 + rsi] + movzx edi,ah + shr eax,16 + movzx edi,byte ptr[r12 + rdi] + movzx ebp,al + shl edi,8 + movzx ebp,byte ptr[r12 + rbp] + shl ebp,16 + movzx eax,ah + movzx r8d,byte ptr[r12 + rax] + shl r8d,24 + + movzx eax,bl + movzx eax,byte ptr[r12 + rax] + or edi,eax + movzx eax,bh + shr ebx,16 + movzx eax,byte ptr[r12 + rax] + shl eax,8 + or ebp,eax + movzx eax,bl + movzx eax,byte ptr[r12 + rax] + movzx ebx,bh + shl eax,16 + movzx ebx,byte ptr[r12 + rbx] + or r8d,eax + shl ebx,24 + or esi,ebx + + movzx eax,cl + movzx ebx,ch + movzx eax,byte ptr[r12 + rax] + shr ecx,16 + movzx ebx,byte ptr[r12 + rbx] + shl ebx,8 + or ebp,eax + or r8d,ebx + movzx eax,cl + movzx eax,byte ptr[r12 + rax] + movzx ebx,ch + movzx ebx,byte ptr[r12 + rbx] + shl eax,16 + shl ebx,24 + or esi,eax + or edi,ebx + + movzx eax,dl + movzx ebx,dh + movzx eax,byte ptr[r12 + rax] + shr edx,16 + movzx ebx,byte ptr[r12 + rbx] + shl ebx,8 + or r8d,eax + or esi,ebx + movzx eax,dl + movzx eax,byte ptr[r12 + rax] + movzx ebx,dh + movzx ebx,byte ptr[r12 + rbx] + shl eax,16 + shl ebx,24 + or edi,eax + or ebp,ebx + + // + // xor in final round key + // + + xor esi,[r10] + xor edi,[r10+4] + xor ebp,[r10+8] + xor r8d,[r10+12] +MACRO_END() + +#if USE_BLOCK_FUNCTION + + // + // We use a block function, the AES_ENCRYPT macro merely calls 
the function
+ //
+
+MACRO_START(AES_ENCRYPT, loopLabel)
+ call SymCryptAesEncryptAsmInternal
+MACRO_END()
+
+MACRO_START(AES_DECRYPT, loopLabel)
+ call SymCryptAesDecryptAsmInternal
+MACRO_END()
+
+//========================================
+// SymCryptAesEncryptAsmInternal
+//
+// Internal AES encryption routine with modified calling convention.
+// This function has the exact same calling convention as the AES_ENCRYPT_MACRO
+
+FUNCTION_START(SymCryptAesEncryptAsmInternal, 0, 0)
+
+ AES_ENCRYPT_MACRO SymCryptAesEncryptAsmInternalLoop
+
+FUNCTION_END(SymCryptAesEncryptAsmInternal)
+
+//========================================
+// SymCryptAesDecryptAsmInternal
+//
+// Internal AES decryption routine with modified calling convention.
+// This function has the exact same calling convention as the AES_DECRYPT_MACRO
+//
+
+FUNCTION_START(SymCryptAesDecryptAsmInternal, 0, 0)
+
+ AES_DECRYPT_MACRO SymCryptAesDecryptAsmInternalLoop
+
+FUNCTION_END(SymCryptAesDecryptAsmInternal)
+
+#else
+
+ //
+ // No block function, use the macro directly
+ //
+
+MACRO_START(AES_ENCRYPT, loopLabel)
+ AES_ENCRYPT_MACRO loopLabel
+MACRO_END()
+
+MACRO_START(AES_DECRYPT, loopLabel)
+ AES_DECRYPT_MACRO loopLabel
+MACRO_END()
+
+#endif
+
+//
+//VOID
+//SYMCRYPT_CALL
+//SymCryptAesEncrypt( _In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
+// _In_reads_bytes_( SYMCRYPT_AES_BLOCK_LEN ) PCBYTE pbPlaintext,
+// _Out_writes_bytes_( SYMCRYPT_AES_BLOCK_LEN ) PBYTE pbCiphertext )
+//
+
+NESTED_FUNCTION_START(SymCryptAesEncryptAsm, 3, 15)
+
+ SYMCRYPT_CHECK_MAGIC SymCryptAesEncryptAsmCheckMagic, Q1, magicFieldOffset, Q1
+
+ // Here we convert from whatever calling convention we are called from externally to our
+ // AES internal calling convention.
+ // We need to be careful that we don't overwrite an argument register before we copy it to
+ // the place it is needed internally in the AES functions.
+ // There is no automatic method for checking we do this correctly - modify with care!
+ // In SystemV and MSFT x64 ABIs, the possible 3 argument registers are:
+ // rcx, rdx, r8, rdi, rsi
+
+ mov r10, [Q1 + lastEncRoundKeyOffset]
+ mov r9, Q1
+
+ mov [rsp + GET_MEMSLOT_OFFSET(slot0)], Q3
+
+ //
+ // Load the plaintext
+ //
+ mov eax,[Q2 ]
+ mov ebx,[Q2 + 4]
+ mov ecx,[Q2 + 8]
+ mov edx,[Q2 + 12]
+
+ lea r11,[GET_SYMBOL_ADDRESS(SymCryptAesSboxMatrixMult)]
+
+ AES_ENCRYPT SymCryptAesEncryptAsmLoop
+ // Plaintext in eax, ebx, ecx, edx
+ // r9 points to first round key to use
+ // r10 is last key to use (unchanged)
+ // r11 points to SboxMatrixMult (unchanged)
+ // Ciphertext ends up in esi, edi, ebp, r8d
+
+ // retrieve pbCiphertext using Q0 because it is always rax regardless of calling convention
+ mov Q0,[rsp + GET_MEMSLOT_OFFSET(slot0)]
+ mov [Q0 ], esi
+ mov [Q0 + 4], edi
+ mov [Q0 + 8], ebp
+ mov [Q0 + 12], r8d
+
+NESTED_FUNCTION_END(SymCryptAesEncryptAsm)
+
+
+//
+//VOID
+//SYMCRYPT_CALL
+//SymCryptAesDecrypt( _In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
+// _In_reads_bytes_( SYMCRYPT_AES_BLOCK_LEN ) PCBYTE pbCiphertext,
+// _Out_writes_bytes_( SYMCRYPT_AES_BLOCK_LEN ) PBYTE pbPlaintext )
+
+NESTED_FUNCTION_START(SymCryptAesDecryptAsm, 3, 15)
+
+ SYMCRYPT_CHECK_MAGIC SymCryptAesDecryptAsmCheckMagic, Q1, magicFieldOffset, Q1
+
+ // Here we convert from whatever calling convention we are called from externally to our
+ // AES internal calling convention.
+ // We need to be careful that we don't overwrite an argument register before we copy or use
+ // the value appropriately for use in the AES functions.
+ // There is no automatic method for checking we do this correctly - modify with care!
+ // In SystemV and MSFT x64 ABIs, the possible 3 argument registers are:
+ // rcx, rdx, r8, rdi, rsi
+
+ mov r9,[Q1 + lastEncRoundKeyOffset]
+ mov r10,[Q1 + lastDecRoundKeyOffset]
+
+ mov [rsp + GET_MEMSLOT_OFFSET(slot0)], Q3
+
+ mov eax,[Q2 ]
+ mov ebx,[Q2 + 4]
+ mov ecx,[Q2 + 8]
+ mov edx,[Q2 + 12]
+
+ lea r11,[GET_SYMBOL_ADDRESS(SymCryptAesInvSboxMatrixMult)]
+ lea r12,[GET_SYMBOL_ADDRESS(SymCryptAesInvSbox)]
+
+ AES_DECRYPT SymCryptAesDecryptAsmLoop
+ // Ciphertext in eax, ebx, ecx, edx
+ // r9 points to first round key to use
+ // r10 is last key to use (unchanged)
+ // r11 points to InvSboxMatrixMult (unchanged)
+ // r12 points to InvSbox (unchanged)
+ // Plaintext ends up in esi, edi, ebp, r8d
+
+ // retrieve pbPlaintext using Q0 because it is always rax regardless of calling convention
+ mov Q0,[rsp + GET_MEMSLOT_OFFSET(slot0)]
+ mov [Q0 ], esi
+ mov [Q0 + 4], edi
+ mov [Q0 + 8], ebp
+ mov [Q0 + 12], r8d
+
+NESTED_FUNCTION_END(SymCryptAesDecryptAsm)
+
+//VOID
+//SYMCRYPT_CALL
+//SymCryptAesCbcEncrypt(
+// _In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
+// _In_reads_bytes_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbChainingValue,
+// _In_reads_bytes_( cbData ) PCBYTE pbSrc,
+// _Out_writes_bytes_( cbData ) PBYTE pbDst,
+// SIZE_T cbData )
+
+NESTED_FUNCTION_START(SymCryptAesCbcEncryptAsm, 5, 15)
+
+ // Here we convert from whatever calling convention we are called from externally to our
+ // AES internal calling convention.
+ // We need to be careful that we don't overwrite an argument register before we copy or use
+ // the value appropriately for use in the AES functions.
+ // There is no automatic method for checking we do this correctly - modify with care!
+ // In SystemV and MSFT x64 ABIs, the possible 5 argument registers are:
+ // rcx, rdx, r8, r9, r10, rdi, rsi
+
+ SYMCRYPT_CHECK_MAGIC SymCryptAesCbcEncryptAsmCheckMagic, Q1, magicFieldOffset, Q1
+
+ and Q5, NOT 15 // only deal with whole # blocks
+ jz SymCryptAesCbcEncryptNoData
+
+ mov [rsp + GET_MEMSLOT_OFFSET(slot0)], Q2 // save pbChainingValue
+ mov rax, Q2 // rax = pbChainingValue
+ mov r13, Q3 // r13 = pbSrc
+
+ mov r15, Q5 // r15 = cbData
+ mov r14, Q4 // r14 = pbDst
+
+ add r15, Q3 // r15 = pbSrcEnd
+
+ mov r10,[Q1 + lastEncRoundKeyOffset] // r10 = last enc round key
+ mov r12,Q1 // r12 = first round key to use
+
+ //
+ // Load the chaining state from pbChainingValue
+ //
+ mov esi,[rax ]
+ mov edi,[rax + 4]
+ mov ebp,[rax + 8]
+ mov r8d,[rax + 12]
+
+ lea r11,[GET_SYMBOL_ADDRESS(SymCryptAesSboxMatrixMult)]
+
+ALIGN(16)
+SymCryptAesCbcEncryptAsmLoop:
+ // Loop register setup
+ // r10 = last round key to use
+ // r12 = first round key to use
+ // r13 = pbSrc
+ // r14 = pbDst
+ // r15 = pbSrcEnd
+
+ // chaining state in (esi,edi,ebp,r8d)
+
+ mov eax, [r13]
+ mov r9, r12
+ mov ebx, [r13+4]
+ xor eax, esi
+ mov ecx, [r13+8]
+ xor ebx, edi
+ xor ecx, ebp
+ mov edx, [r13+12]
+ xor edx, r8d
+
+ add r13, 16
+
+
+ AES_ENCRYPT SymCryptAesCbcEncryptAsmInnerLoop
+ //
+ // Plaintext in eax, ebx, ecx, edx
+ // r9 points to first round key to use
+ // r10 is last key to use (unchanged)
+ // r11 points to SboxMatrixMult (unchanged)
+ // Ciphertext ends up in esi, edi, ebp, r8d
+ //
+
+ mov [r14], esi
+ mov [r14+4], edi
+ mov [r14+8], ebp
+ mov [r14+12], r8d
+
+ add r14, 16
+
+ cmp r13, r15
+
+ jb SymCryptAesCbcEncryptAsmLoop
+
+
+ //
+ // Update the chaining value
+ //
+ mov Q0,[rsp + GET_MEMSLOT_OFFSET(slot0)]
+ mov [Q0], esi
+ mov [Q0+4], edi
+ mov [Q0+8], ebp
+ mov [Q0+12], r8d
+
+SymCryptAesCbcEncryptNoData:
+
+NESTED_FUNCTION_END(SymCryptAesCbcEncryptAsm)
+
+
+//VOID
+//SYMCRYPT_CALL
+//SymCryptAesCbcDecrypt(
+// _In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
+// _In_reads_bytes_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbChainingValue,
+// _In_reads_bytes_( cbData ) PCBYTE pbSrc,
+// _Out_writes_bytes_( cbData ) PBYTE pbDst,
+// SIZE_T cbData )
+
+NESTED_FUNCTION_START(SymCryptAesCbcDecryptAsm, 5, 15)
+
+ // Here we convert from whatever calling convention we are called from externally to our
+ // AES internal calling convention.
+ // We need to be careful that we don't overwrite an argument register before we copy or use
+ // the value appropriately for use in the AES functions.
+ // There is no automatic method for checking we do this correctly - modify with care!
+ // In SystemV and MSFT x64 ABIs, the possible 5 argument registers are:
+ // rcx, rdx, r8, r9, r10, rdi, rsi
+
+ SYMCRYPT_CHECK_MAGIC SymCryptAesCbcDecryptAsmCheckMagic, Q1, magicFieldOffset, Q1
+
+ and Q5, NOT 15
+ jz SymCryptAesCbcDecryptNoData
+
+ mov [rsp + GET_MEMSLOT_OFFSET(slot0)], Q2 // save pbChainingValue
+ mov [rsp + GET_MEMSLOT_OFFSET(slot1)], Q3 // save pbSrc
+
+ lea r14, [Q5 - 16]
+ lea r15, [Q4 + r14] // r15 = pbDst pointed to last block
+ add r14, Q3 // r14 = pbSrc pointed to last block
+
+ mov r13,[Q1 + lastEncRoundKeyOffset]
+ mov r10,[Q1 + lastDecRoundKeyOffset]
+
+ lea r11,[GET_SYMBOL_ADDRESS(SymCryptAesInvSboxMatrixMult)]
+ lea r12,[GET_SYMBOL_ADDRESS(SymCryptAesInvSbox)]
+
+ //
+ // Load last ciphertext block & save on stack (we need to put it in the pbChaining buffer later)
+ //
+ mov eax,[r14]
+ mov ebx,[r14+4]
+ mov ecx,[r14+8]
+ mov edx,[r14+12]
+
+ mov [rsp + GET_MEMSLOT_OFFSET(slot2) ], eax
+ mov [rsp + GET_MEMSLOT_OFFSET(slot2)+4], ebx
+ mov [rsp + GET_MEMSLOT_OFFSET(slot3) ], ecx
+ mov [rsp + GET_MEMSLOT_OFFSET(slot3)+4], edx
+
+ jmp SymCryptAesCbcDecryptAsmLoopEntry
+
+ALIGN(16)
+
+SymCryptAesCbcDecryptAsmLoop:
+ // Loop register setup
+ // r13 = first round key to use
+ // r14 = pbSrc
+ // r15 = pbDst
+ // [slot1] = pbSrcStart
+
+ // current ciphertext block (esi,edi,ebp,r8d)
+
+ mov eax,[r14-16]
+ mov ebx,[r14-12]
+ xor esi,eax
+ mov ecx,[r14-8]
+ xor edi,ebx
+ mov [r15],esi
+ mov edx,[r14-4]
+ xor ebp,ecx
+ mov [r15+4],edi
+ xor r8d,edx
+ mov [r15+8],ebp
+ mov [r15+12],r8d
+
+ sub r14,16
+ sub r15,16
+
+SymCryptAesCbcDecryptAsmLoopEntry:
+
+ mov r9, r13
+
+ AES_DECRYPT SymCryptAesCbcDecryptAsmInnerLoop
+ //
+ // Ciphertext in eax, ebx, ecx, edx
+ // r9 points to first round key to use
+ // r10 is last key to use (unchanged)
+ // r11 points to InvSboxMatrixMult (unchanged)
+ // r12 points to InvSbox (unchanged)
+ // Plaintext ends up in esi, edi, ebp, r8d
+ //
+
+ cmp r14, [rsp + GET_MEMSLOT_OFFSET(slot1)] // pbSrc
+ ja SymCryptAesCbcDecryptAsmLoop
+
+ mov rbx,[rsp + GET_MEMSLOT_OFFSET(slot0)] // pbChainingValue
+ xor esi,[rbx]
+ xor edi,[rbx+4]
+ xor ebp,[rbx+8]
+ xor r8d,[rbx+12]
+
+ mov [r15], esi
+ mov [r15+4], edi
+ mov [r15+8], ebp
+ mov [r15+12], r8d
+
+ //
+ // Update the chaining value to the last ciphertext block
+ //
+ mov rax,[rsp + GET_MEMSLOT_OFFSET(slot2)]
+ mov rcx,[rsp + GET_MEMSLOT_OFFSET(slot3)]
+ mov [rbx], rax
+ mov [rbx+8], rcx
+
+SymCryptAesCbcDecryptNoData:
+
+NESTED_FUNCTION_END(SymCryptAesCbcDecryptAsm)
+
+//VOID
+//SYMCRYPT_CALL
+//SymCryptAesCtrMsb64(
+// _In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
+// _In_reads_bytes_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbChainingValue,
+// _In_reads_bytes_( cbData ) PCBYTE
pbSrc, +// _Out_writes_bytes_( cbData ) PBYTE pbDst, +// SIZE_T cbData ) + +NESTED_FUNCTION_START(SymCryptAesCtrMsb64Asm, 5, 15) + + // Here we convert from whatever calling convention we are called from externally to our + // AES internal calling convention. + // We need to be careful that we don't overwrite an argument register before we copy or use + // the value appropriately for use in the AES functions. + // There is no automatic method for checking we do this correctly - modify with care! + // In SystemV and MSFT x64 ABIs, the possible 5 argument registers are: + // rcx, rdx, r8, r9, r10, rdi, rsi + + SYMCRYPT_CHECK_MAGIC SymCryptAesCtrMsb64AsmCheckMagic, Q1, magicFieldOffset, Q1 + + and Q5, NOT 15 // only deal with whole # blocks + jz SymCryptAesCtrMsb64NoData + + mov [rsp + GET_MEMSLOT_OFFSET(slot0)], Q2 // save pbChainingState + mov rax, Q2 // rax = pbChainingValue + mov r13, Q3 // r13 = pbSrc + mov r14, Q5 // r14 = cbData + mov r15, Q4 // r15 = pbDst + add r14, Q3 // r14 = cbData + pbSrc = pbSrcEnd + + mov r10,[Q1 + lastEncRoundKeyOffset] // r10 = last enc round key + mov r12,Q1 // r12 = first round key to use + + + lea r11,[GET_SYMBOL_ADDRESS(SymCryptAesSboxMatrixMult)] + + // + // Load the chaining state + // + mov rcx, [rax + 8] + mov rax, [rax ] + + // + // Store it in our local copy (we have no register free to keep pbChainingState in) + // + mov [rsp + GET_MEMSLOT_OFFSET(slot1)], rax + mov [rsp + GET_MEMSLOT_OFFSET(slot2)], rcx + + // + // Move to the right registers + // + mov rbx, rax + mov rdx, rcx + shr rbx, 32 + shr rdx, 32 + +ALIGN(16) +SymCryptAesCtrMsb64AsmLoop: + // Loop invariant + // Current chaining state is in (eax, ebx, ecx, edx) + // r10 = last round key to use + // r11 = SboxMatrixMult + // r12 = first round key to use + // r13 = pbSrc + // r14 = pbSrcEnd + // r15 = pbDst + // [slot1..slot2] = 16 bytes chaining state block + + mov r9, r12 + + AES_ENCRYPT SymCryptAesCtrMsb64AsmInnerLoop + // + // Plaintext in eax, ebx, ecx, edx + // r9 points to first round key to use + // r10 is last key to use (unchanged) + // r11 points to SboxMatrixMult (unchanged) + // Ciphertext ends up in esi, edi, ebp, r8d + // + + // To improve latency, we FIRST + // load the chaining state, increment the counter, and write it back. + // leave the state in the (eax, ebx, ecx, edx) registers + + mov eax,dword ptr [rsp + GET_MEMSLOT_OFFSET(slot1) + 0] + mov ebx,dword ptr [rsp + GET_MEMSLOT_OFFSET(slot1) + 4] + mov rcx,[rsp + GET_MEMSLOT_OFFSET(slot2) ] + bswap rcx + add rcx, 1 + bswap rcx + mov [rsp + GET_MEMSLOT_OFFSET(slot2) ], rcx + mov rdx, rcx + shr rdx, 32 + + // THEN we process the XOR of the key stream with the data + // This order is faster as we need to have the chaining state done + // before we can proceed, but there are no dependencies on the data result + // So we can loop back to the beginning while the data stream read/writes are + // still in flight. 
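[Editor's note] The counter arithmetic just above is easy to miss among the loads and stores: bytes 8..15 of the chaining state are a big-endian (MSB-first) 64-bit counter, which the asm byte-swaps, increments, and swaps back. A C sketch of the 'bswap rcx / add rcx, 1 / bswap rcx' sequence, assuming the GCC/Clang __builtin_bswap64 builtin (MSVC has _byteswap_uint64 in <stdlib.h>):

    #include <stdint.h>

    // Increment a 64-bit counter stored in big-endian memory order.
    static inline uint64_t CtrMsb64Increment( uint64_t counterBigEndian )
    {
        uint64_t c = __builtin_bswap64( counterBigEndian );   // to host order
        c += 1;
        return __builtin_bswap64( c );                        // back to big-endian
    }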
+ // + // xor with the source stream + + xor esi,[r13 + 0 ] + xor edi,[r13 + 4 ] + xor ebp,[r13 + 8 ] + xor r8d,[r13 + 12] + + // store at the destination + + mov [r15 + 0], esi + mov [r15 + 4], edi + mov [r15 + 8], ebp + mov [r15 + 12], r8d + + add r13, 16 // pbSrc += 16 + add r15, 16 // pbDst += 16 + + cmp r13, r14 + + jb SymCryptAesCtrMsb64AsmLoop + + // + // Copy back the chaining value - we only modified the last 8 bytes, so that is all we copy + // + mov rsi,[rsp + GET_MEMSLOT_OFFSET(slot0)] // pbChainingState + mov [rsi + 8], ecx + mov [rsi + 12], edx + + // + // Wipe the chaining value on stack + // + xor rax, rax + mov [rsp + GET_MEMSLOT_OFFSET(slot1)], rax + mov [rsp + GET_MEMSLOT_OFFSET(slot2)], rax + +SymCryptAesCtrMsb64NoData: + +NESTED_FUNCTION_END(SymCryptAesCtrMsb64Asm) + +FILE_END() diff --git a/lib/amd64/fdef369_asm.asm b/lib/amd64/fdef369_asm.asm deleted file mode 100644 index f13d995..0000000 --- a/lib/amd64/fdef369_asm.asm +++ /dev/null @@ -1,529 +0,0 @@ -; -; fdef_369asm.asm Assembler code for large integer arithmetic in the default data format -; -; This file contains alternative routines that are used for modular computations -; where the modulus is 257-384 or 513-576 bits long. -; (Currently on ARM64 it is also used for 0-192-bit moduli but not on AMD64) -; -; The immediate advantage is that it improves EC performance on 384, and 521-bit curves. -; -; Most of this code is a direct copy of the default code. -; AMD64 digits are now 512 bits. -; We read the 'ndigit' value. If it is 1 digit, the values are 6 64-bit words, if it is 2 the values -; are 9 64-bit words. As we compute in groups of 3, our loop counters are one more than nDigit -; -; Copyright (c) Microsoft Corporation. Licensed under the MIT license. -; - -include ksamd64.inc - -include symcrypt_version.inc -include symcrypt_magic.inc - - - -include C_asm_shared.inc - -; A digit consists of 4 words of 64 bits each - -;UINT32 -;SYMCRYPT_CALL -;SymCryptFdef369RawAddAsm( -; _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 Src1, -; _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 Src2, -; _Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 Dst, -; UINT32 nDigits ); - - LEAF_ENTRY SymCryptFdef369RawAddAsm, _TEXT - - ; rcx = Src1 - ; rdx = Src2 - ; r8 = Dst - ; r9 = nDigits - - add r9, 1 - xor rax, rax - xor r10, r10 - - ; Cy = 0 - -SymCryptFdef369RawAddAsmLoop: - ; carry is in the carry flag - mov rax,[rcx] - adc rax,[rdx] - mov [r8],rax - - mov rax,[rcx + 8] - adc rax,[rdx + 8] - mov [r8 + 8], rax - - mov rax,[rcx + 16] - adc rax,[rdx + 16] - mov [r8 + 16], rax - - lea rcx, [rcx + 24] - lea rdx, [rdx + 24] - lea r8, [r8 + 24] - dec r9d - jnz SymCryptFdef369RawAddAsmLoop - - mov rax, r10 - adc rax, r10 - - ret - - LEAF_END SymCryptFdef369RawAddAsm, _TEXT - - -;UINT32 -;SYMCRYPT_CALL -;SymCryptFdefRawSubAsm( -; _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc1, -; _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc2, -; _Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 pDst, -; UINT32 nDigits ); - - LEAF_ENTRY SymCryptFdef369RawSubAsm, _TEXT - - ; rcx = Src1 - ; rdx = Src2 - ; r8 = Dst - ; r9 = nDigits - - add r9, 1 - xor rax, rax - xor r10, r10 - -SymCryptFdef369RawSubAsmLoop: - ; carry is in the carry flag - mov rax,[rcx] - sbb rax,[rdx] - mov [r8],rax - - mov rax,[rcx + 8] - sbb rax,[rdx + 8] - mov [r8 + 8], rax - - mov rax,[rcx + 16] - sbb rax,[rdx + 16] - mov [r8 + 16], rax - - lea rcx, [rcx + 24] - lea rdx, [rdx + 24] - 
lea r8, [r8 + 24] - dec r9d - jnz SymCryptFdef369RawSubAsmLoop - - mov rax, r10 - adc rax, r10 - - ret - - LEAF_END SymCryptFdef369RawSubAsm, _TEXT - - - -;VOID -;SYMCRYPT_CALL -;SymCryptFdefMaskedCopy( -; _In_reads_bytes_( nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCBYTE pbSrc, -; _InOut_writes_bytes_( nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PBYTE pbDst, -; UINT32 nDigits, -; UINT32 mask ) - - LEAF_ENTRY SymCryptFdef369MaskedCopyAsm, _TEXT - - add r8d, 1 - movsxd r9, r9d - -SymCryptFdef369MaskedCopyAsmLoop: - mov rax, [rcx] - mov r10, [rdx] - xor rax, r10 - and rax, r9 - xor rax, r10 - mov [rdx], rax - - mov rax, [rcx + 8] - mov r10, [rdx + 8] - xor rax, r10 - and rax, r9 - xor rax, r10 - mov [rdx + 8], rax - - mov rax, [rcx + 16] - mov r10, [rdx + 16] - xor rax, r10 - and rax, r9 - xor rax, r10 - mov [rdx + 16], rax - - ; Move on to the next digit - - add rcx, 24 - add rdx, 24 - sub r8d, 1 - jnz SymCryptFdef369MaskedCopyAsmLoop - ret - - LEAF_END SymCryptFdef369MaskedCopyAsm, _TEXT - -;VOID -;SYMCRYPT_CALL -;SymCryptFdefRawMul( -; _In_reads_(nWords1) PCUINT32 pSrc1, -; UINT32 nDigits1, -; _In_reads_(nWords2) PCUINT32 pSrc2, -; UINT32 nDigits2, -; _Out_writes_(nWords1 + nWords2) PUINT32 pDst ) - -SymCryptFdef369RawMulAsm_Frame struct - SavedRbx dq ? - SavedRdi dq ? - SavedRsi dq ? - SavedR13 dq ? - SavedR12 dq ? - returnaddress dq ? - Arg1Home dq ? - Arg2Home dq ? - Arg3Home dq ? - Arg4Home dq ? - pDst dq ? - -SymCryptFdef369RawMulAsm_Frame ends - - NESTED_ENTRY SymCryptFdef369RawMulAsm, _TEXT - - rex_push_reg rbx - push_reg r12 - push_reg r13 - push_reg rsi - push_reg rdi - - END_PROLOGUE - - ; Basic structure: - ; for each word in Src1: - ; Dst += Src2 * word - ; Register assignments - ; - ; rax = tmp for mul - ; rbx = word from Src1 to multiply with - ; rcx = pSrc1 (updated in outer loop) - ; rdx = tmp for mul - ; rsi = inner loop pointer into pSrc2 - ; rdi = inner loop pointer into pDst - ; r8 = pSrc2 - ; r9 = nDigits2 - ; r10 = pDst (incremented in outer loop) - ; r11 = # words left from Src1 to process - ; r12 = carry - ; r13 = inner loop counter - - - add edx, 1 - add r9d, 1 - lea r11d, [edx + 2*edx] ; nDigits1 * 3 = # words in Src1 to process - mov r10, [rsp + SymCryptFdef369RawMulAsm_Frame.pDst ] - - ; Outer loop invariant established: rcx, r8, r9, r10 - - - mov rsi, r8 ; rsi = pSrc2 - mov rdi, r10 ; rdi = pDst + outer loop ctr - mov rbx, [rcx] ; mulword - xor r12, r12 - mov r13d, r9d - - ; First inner loop overwrites Dst, which avoids adding the current Dst value - -SymCryptFdef369RawMulAsmLoop1: - mov rax, [rsi] - mul rbx - add rax, r12 - adc rdx, 0 - mov [rdi], rax - mov r12, rdx - - mov rax, [rsi + 8] - mul rbx - add rax, r12 - adc rdx, 0 - mov [rdi + 8], rax - mov r12, rdx - - mov rax, [rsi + 16] - mul rbx - add rax, r12 - adc rdx, 0 - mov [rdi + 16], rax - mov r12, rdx - - add rsi, 24 - add rdi, 24 - sub r13d,1 - jnz SymCryptFdef369RawMulAsmLoop1 - - mov [rdi], rdx ; write last word, cannot overflow because Dst is at least 2 digits long - - sub r11d, 1 - -SymCryptFdef369RawMulAsmLoopOuter: - - add rcx, 8 ; move to next word of pSrc1 - add r10, 8 ; move Dst pointer one word over - mov rbx, [rcx] - mov rsi, r8 - mov rdi, r10 - xor r12, r12 - mov r13d, r9d - -SymCryptFdef369RawMulAsmLoop2: - mov rax, [rsi] - mul rbx - add rax, [rdi] - adc rdx, 0 - add rax, r12 - adc rdx, 0 - mov [rdi], rax - mov r12, rdx - - mov rax, [rsi + 8] - mul rbx - add rax, [rdi + 8] - adc rdx, 0 - add rax, r12 - adc rdx, 0 - mov [rdi + 8], rax - mov r12, rdx - - mov rax, [rsi + 16] - mul rbx - add rax, [rdi + 
16] - adc rdx, 0 - add rax, r12 - adc rdx, 0 - mov [rdi + 16], rax - mov r12, rdx - - add rsi, 24 - add rdi, 24 - sub r13d,1 - jnz SymCryptFdef369RawMulAsmLoop2 - - mov [rdi], rdx ; write next word. (stays within Dst buffer) - - sub r11d, 1 - jnz SymCryptFdef369RawMulAsmLoopOuter - - BEGIN_EPILOGUE - - pop rdi - pop rsi - pop r13 - pop r12 - pop rbx - ret - - NESTED_END SymCryptFdef369RawMulAsm, _TEXT - - - - - - -;VOID -;SymCryptFdefMontgomeryReduceAsm( -; _In_ PCSYMCRYPT_MODULUS pmMod, -; _In_ PUINT32 pSrc, -; _Out_ PUINT32 pDst ) - - NESTED_ENTRY SymCryptFdef369MontgomeryReduceAsm, _TEXT - - rex_push_reg rbx - push_reg r12 - push_reg r13 - push_reg r14 - push_reg rsi - push_reg rdi - push_reg rbp - - END_PROLOGUE - - mov r11, rdx ; r11 = pSrc - mov ebp, [rcx + SymCryptModulusNdigitsOffsetAmd64] ; nDigits - add ebp, 1 - mov r13, [rcx + SymCryptModulusMontgomeryInv64OffsetAmd64] ; inv64 - - lea rcx, [rcx + SymCryptModulusValueOffsetAmd64] ; modulus value - - lea edi, [ebp + 2*ebp] ; outer loop counter, in words - - xor r14d, r14d - - ; General register allocations - ; rax = multiply result - ; rbx = multiplier in inner loop - ; rcx = pointer to modulus value - ; rdx = multiply result - ; rsi = loop counter - ; rdi = loop counter - ; rbp = nDigits - ; r8 = pDst - ; r9 = running pointer in Src - ; r10 = running pointer in Mod - ; r11 = pSrc (updated in outer loop) - ; r12 = carry - ; r13 = pmMod->tm.montgomery.inv64 - ; r14 = carry out from last word of previous loop iteration - - -SymCryptFdef369MontgomeryReduceAsmOuterLoop: - - ; start decoder with a few simple instructions, including at least one that requires - ; a uop execution and is on the critical path - - mov rbx, [r11] ; fetch word of Src we want to set to zero - mov r10, r11 - mov r9, rcx - - imul rbx, r13 ; lower word is same for signed & unsigned multiply - - mov esi, ebp - xor r12d, r12d - -SymCryptFdef369MontgomeryReduceAsmInnerloop: - ; rax = mul scratch - ; rbx = multiplier - ; rcx = pointer to modulus value - ; rdx = mul scratch - ; edi = outer loop counter (words) - ; esi = inner loop counter (digits) - ; r9 = running ptr to modulus - ; r10 = running ptr to input/scratch - ; r12 = carry (64 bits) - - mov rax, [r9] - mul rbx - add rax, [r10] - adc rdx, 0 - add rax, r12 - adc rdx, 0 - mov [r10], rax - mov r12, rdx - - mov rax, [r9 + 8] - mul rbx - add rax, [r10 + 8] - adc rdx, 0 - add rax, r12 - adc rdx, 0 - mov [r10 + 8], rax - mov r12, rdx - - mov rax, [r9 + 16] - mul rbx - add rax, [r10 + 16] - adc rdx, 0 - add rax, r12 - adc rdx, 0 - mov [r10 + 16], rax - mov r12, rdx - - add r9, 24 - add r10, 24 - sub esi,1 - jnz SymCryptFdef369MontgomeryReduceAsmInnerloop - - add r12, r14 - mov r14d, 0 - adc r14, 0 - add r12, [r10] - adc r14, 0 - mov [r10], r12 - - add r11, 8 - - sub edi, 1 - jnz SymCryptFdef369MontgomeryReduceAsmOuterLoop - - ; - ; Most of the work is done; now all that is left is subtract the modulus if it is smaller than the result - ; - - ; First we compute the pSrc result minus the modulus into the destination - mov esi, ebp ; loop ctr - mov r10, r11 ; pSrc - mov r9, rcx ; pMod - mov r12, r8 ; pDst - - ; Cy = 0 because the last 'sub edi,1' resulted in 0 - -SymCryptFdef369MontgomeryReduceAsmSubLoop: - mov rax,[r10] - sbb rax,[r9] - mov [r12], rax - - mov rax,[r10 + 8] - sbb rax,[r9 + 8] - mov [r12 + 8], rax - - mov rax,[r10 + 16] - sbb rax,[r9 + 16] - mov [r12 + 16], rax - - lea r10,[r10+24] - lea r9, [r9 +24] - lea r12,[r12+24] - dec esi - jnz SymCryptFdef369MontgomeryReduceAsmSubLoop - - ; Finally a masked copy 
form pSrc to pDst
- ; copy if: r14 == 0 && Cy = 1
- sbb r14, 0 ; mask (64 bits)
-
-
-SymCryptFdef369MontgomeryReduceAsmMaskedCopyLoop:
- mov rax, [r11]
- mov rsi, [r8]
- xor rax, rsi
- and rax, r14
- xor rax, rsi
- mov [r8], rax
-
- mov rax, [r11 + 8]
- mov rsi, [r8 + 8]
- xor rax, rsi
- and rax, r14
- xor rax, rsi
- mov [r8 + 8], rax
-
- mov rax, [r11 + 16]
- mov rsi, [r8 + 16]
- xor rax, rsi
- and rax, r14
- xor rax, rsi
- mov [r8 + 16], rax
-
- ; Move on to the next digit
-
- add r11, 24
- add r8, 24
- sub ebp, 1
- jnz SymCryptFdef369MontgomeryReduceAsmMaskedCopyLoop
-
- BEGIN_EPILOGUE
-
- pop rbp
- pop rdi
- pop rsi
- pop r14
- pop r13
- pop r12
- pop rbx
- ret
-
- NESTED_END SymCryptFdef369MontgomeryReduceAsm, _TEXT
-
- end
-
diff --git a/lib/amd64/fdef369_asm.symcryptasm b/lib/amd64/fdef369_asm.symcryptasm
new file mode 100644
index 0000000..61ae581
--- /dev/null
+++ b/lib/amd64/fdef369_asm.symcryptasm
@@ -0,0 +1,451 @@
+//
+// fdef369_asm.symcryptasm Assembler code for large integer arithmetic in the default data format
+// Expresses asm in a generic enough way to enable generation of MASM and GAS using the
+// symcryptasm_processor.py script and C preprocessor
+//
+// This file contains alternative routines that are used for modular computations
+// where the modulus is 257-384 or 513-576 bits long.
+// (Currently on ARM64 it is also used for 0-192-bit moduli but not on AMD64)
+//
+// The immediate advantage is that it improves EC performance on 384- and 521-bit curves.
+//
+// Most of this code is a direct copy of the default code.
+// AMD64 digits are now 512 bits.
+// We read the 'nDigits' value. If it is 1 digit, the values are 6 64-bit words; if it is 2, the values
+// are 9 64-bit words. As we compute in groups of 3, our loop counters are one more than nDigits.
+//
+// Copyright (c) Microsoft Corporation. Licensed under the MIT license.
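[Editor's note] Both the deleted file above and its symcryptasm replacement below lean on the same branch-free masked-copy idiom (three xors and an and per word) to keep the final "subtract the modulus or not" decision constant-time. A C model of the idiom, as a sketch:

    #include <stdint.h>

    // mask must be all-ones (copy src) or all-zeroes (keep dst); there is no
    // data-dependent branch, so timing does not leak which case was taken.
    static void MaskedCopyWord( const uint64_t *src, uint64_t *dst, uint64_t mask )
    {
        uint64_t s = *src;
        uint64_t d = *dst;
        *dst = d ^ ( ( s ^ d ) & mask );   // mask == ~0 gives s; mask == 0 gives d
    }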
+ +#include "symcryptasm_shared.cppasm" + +// A digit consists of 4 words of 64 bits each + +//UINT32 +//SYMCRYPT_CALL +// SymCryptFdef369RawAddAsm( +// _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 Src1, +// _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 Src2, +// _Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 Dst, +// UINT32 nDigits ) +FUNCTION_START(SymCryptFdef369RawAddAsm, 4, 5) + + inc D4 + xor Q0, Q0 + +SymCryptFdef369RawAddAsmLoop: + // carry is in the carry flag + mov Q0,[Q1] + adc Q0,[Q2] + mov [Q3],Q0 + + mov Q0,[Q1 + 8] + adc Q0,[Q2 + 8] + mov [Q3 + 8], Q0 + + mov Q0,[Q1 + 16] + adc Q0,[Q2 + 16] + mov [Q3 + 16], Q0 + + lea Q1, [Q1 + 24] + lea Q2, [Q2 + 24] + lea Q3, [Q3 + 24] + dec D4 + jnz SymCryptFdef369RawAddAsmLoop + + mov Q0, 0 + adc Q0, Q0 + +FUNCTION_END(SymCryptFdef369RawAddAsm) + +// UINT32 +// SYMCRYPT_CALL +// SymCryptFdef369RawSubAsm( +// _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc1, +// _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc2, +// _Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 pDst, +// UINT32 nDigits ) + +FUNCTION_START(SymCryptFdef369RawSubAsm, 4, 5) + + inc D4 + xor Q0, Q0 + +SymCryptFdef369RawSubAsmLoop: + // carry is in the carry flag + mov Q0,[Q1] + sbb Q0,[Q2] + mov [Q3],Q0 + + mov Q0,[Q1 + 8] + sbb Q0,[Q2 + 8] + mov [Q3 + 8], Q0 + + mov Q0,[Q1 + 16] + sbb Q0,[Q2 + 16] + mov [Q3 + 16], Q0 + + lea Q1, [Q1 + 24] + lea Q2, [Q2 + 24] + lea Q3, [Q3 + 24] + dec D4 + jnz SymCryptFdef369RawSubAsmLoop + + mov Q0, 0 + adc Q0, Q0 + +FUNCTION_END(SymCryptFdef369RawSubAsm) + +// VOID +// SYMCRYPT_CALL +// SymCryptFdef369MaskedCopyAsm( +// _In_reads_bytes_( nDigits*SYMCRYPT_FDEF_DIGIT_SIZE ) PCBYTE pbSrc, +// _Inout_updates_bytes_( nDigits*SYMCRYPT_FDEF_DIGIT_SIZE ) PBYTE pbDst, +// UINT32 nDigits, +// UINT32 mask ) + +FUNCTION_START(SymCryptFdef369MaskedCopyAsm, 4, 6) + + inc D3 + movsxd Q4, D4 + +SymCryptFdef369MaskedCopyAsmLoop: + mov Q0, [Q1] + mov Q5, [Q2] + xor Q0, Q5 + and Q0, Q4 + xor Q0, Q5 + mov [Q2], Q0 + + mov Q0, [Q1 + 8] + mov Q5, [Q2 + 8] + xor Q0, Q5 + and Q0, Q4 + xor Q0, Q5 + mov [Q2 + 8], Q0 + + mov Q0, [Q1 + 16] + mov Q5, [Q2 + 16] + xor Q0, Q5 + and Q0, Q4 + xor Q0, Q5 + mov [Q2 + 16], Q0 + + // Move on to the next digit + + add Q1, 24 + add Q2, 24 + dec D3 + jnz SymCryptFdef369MaskedCopyAsmLoop + +FUNCTION_END(SymCryptFdef369MaskedCopyAsm) + +// VOID +// SYMCRYPT_CALL +// SymCryptFdef369RawMulAsm( +// _In_reads_(nDigits1*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc1, +// UINT32 nDigits1, +// _In_reads_(nDigits2*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc2, +// UINT32 nDigits2, +// _Out_writes_((nDigits1+nDigits2)*SYMCRYPT_FDEF_DIGIT_NUINT32) PUINT32 pDst ) + +MUL_FUNCTION_START(SymCryptFdef369RawMulAsm, 5, 11) + + // Basic structure: + // for each word in Src1: + // Dst += Src2 * word + // Register assignments + // + // Q0 = tmp for mul + // QH = tmp for mul + // Q1 = pSrc1 (updated in outer loop) + // D2 = # words left from Src1 to process + // Q3 = pSrc2 + // Q4 = nDigits2 + // Q5 = pDst (incremented in outer loop) + // Q6 = inner loop pointer into pSrc2 + // Q7 = inner loop pointer into pDst + // Q8 = word from Src1 to multiply with + // Q9 = carry + // D10 = inner loop counter + + inc D2 + inc D4 + lea D2, [D2 + 2*D2] // nDigits1 * 3 = # words in Src1 to process + + // Outer loop invariant established: Q1, Q3, D4, Q5 + + mov Q6, Q3 // Q6 = pSrc2 + mov Q7, Q5 // Q7 = pDst + outer loop ctr + mov Q8, [Q1] // mulword + xor Q9, Q9 + mov 
D10, D4 + + // First inner loop overwrites Dst, which avoids adding the current Dst value + +ALIGN(16) + +SymCryptFdef369RawMulAsmLoop1: + mov Q0, [Q6] + mul Q8 + add Q0, Q9 + adc QH, 0 + mov [Q7], Q0 + mov Q9, QH + + mov Q0, [Q6 + 8] + mul Q8 + add Q0, Q9 + adc QH, 0 + mov [Q7 + 8], Q0 + mov Q9, QH + + mov Q0, [Q6 + 16] + mul Q8 + add Q0, Q9 + adc QH, 0 + mov [Q7 + 16], Q0 + mov Q9, QH + + add Q6, 24 + add Q7, 24 + dec D10 + jnz SymCryptFdef369RawMulAsmLoop1 + + mov [Q7], QH // write last word, cannot overflow because Dst is at least 2 digits long + + dec D2 + +ALIGN(16) + +SymCryptFdef369RawMulAsmLoopOuter: + + add Q1, 8 // move to next word of pSrc1 + add Q5, 8 // move Dst pointer one word over + mov Q8, [Q1] + mov Q6, Q3 + mov Q7, Q5 + xor Q9, Q9 + mov D10, D4 + +ALIGN(16) + +SymCryptFdef369RawMulAsmLoop2: + mov Q0, [Q6] + mul Q8 + add Q0, [Q7] + adc QH, 0 + add Q0, Q9 + adc QH, 0 + mov [Q7], Q0 + mov Q9, QH + + mov Q0, [Q6 + 8] + mul Q8 + add Q0, [Q7 + 8] + adc QH, 0 + add Q0, Q9 + adc QH, 0 + mov [Q7 + 8], Q0 + mov Q9, QH + + mov Q0, [Q6 + 16] + mul Q8 + add Q0, [Q7 + 16] + adc QH, 0 + add Q0, Q9 + adc QH, 0 + mov [Q7 + 16], Q0 + mov Q9, QH + + add Q6, 24 + add Q7, 24 + dec D10 + jnz SymCryptFdef369RawMulAsmLoop2 + + mov [Q7], QH // write next word. (stays within Dst buffer) + + dec D2 + jnz SymCryptFdef369RawMulAsmLoopOuter + +MUL_FUNCTION_END(SymCryptFdef369RawMulAsm) + +// VOID +// SYMCRYPT_CALL +// SymCryptFdef369MontgomeryReduceAsm( +// _In_ PCSYMCRYPT_MODULUS pmMod, +// _Inout_ PUINT32 pSrc, +// _Out_ PUINT32 pDst ) + +MUL_FUNCTION_START(SymCryptFdef369MontgomeryReduceAsm, 3, 13) + + mov D4, [Q1 + SymCryptModulusNdigitsOffsetAmd64] // nDigits + inc D4 + mov Q5, [Q1 + SymCryptModulusMontgomeryInv64OffsetAmd64] // inv64 + + lea Q1, [Q1 + SymCryptModulusValueOffsetAmd64] // modulus value + + lea D12, [D4 + 2*D4] // outer loop counter, in words + + xor D8, D8 + + // General register allocations + // Q0 = multiply result + // QH = multiply result + // Q1 = pointer to modulus value + // Q2 = pSrc (updated in outer loop) + // Q3 = pDst + // D4 = nDigits + // Q5 = pmMod->tm.montgomery.inv64 + // Q6 = multiplier in inner loop + // Q7 = carry + // Q8 = carry out from last word of previous loop iteration + // Q9 = running pointer in Src + // Q10 = running pointer in Mod + // D11 = loop counter + // D12 = outer loop counter (words) + +ALIGN(16) + +SymCryptFdef369MontgomeryReduceAsmOuterLoop: + + // start decoder with a few simple instructions, including at least one that requires + // a uop execution and is on the critical path + + mov Q6, [Q2] // fetch word of Src we want to set to zero + mov Q10, Q2 + mov Q9, Q1 + + imul Q6, Q5 // lower word is same for signed & unsigned multiply + + mov D11, D4 + xor D7, D7 + +ALIGN(16) + +SymCryptFdef369MontgomeryReduceAsmInnerloop: + // Q0 = mul scratch + // QH = mul scratch + // Q1 = pointer to modulus value + // Q6 = multiplier + // Q7 = carry (64 bits) + // Q9 = running ptr to modulus + // Q10 = running ptr to input/scratch + // D11 = inner loop counter (digits) + // D12 = outer loop counter (words) + + mov Q0, [Q9] + mul Q6 + add Q0, [Q10] + adc QH, 0 + add Q0, Q7 + adc QH, 0 + mov [Q10], Q0 + mov Q7, QH + + mov Q0, [Q9 + 8] + mul Q6 + add Q0, [Q10 + 8] + adc QH, 0 + add Q0, Q7 + adc QH, 0 + mov [Q10 + 8], Q0 + mov Q7, QH + + mov Q0, [Q9 + 16] + mul Q6 + add Q0, [Q10 + 16] + adc QH, 0 + add Q0, Q7 + adc QH, 0 + mov [Q10 + 16], Q0 + mov Q7, QH + + add Q9, 24 + add Q10, 24 + dec D11 + jnz SymCryptFdef369MontgomeryReduceAsmInnerloop + + add Q7, Q8 + 
mov D8, 0
+ adc Q8, 0
+ add Q7, [Q10]
+ adc Q8, 0
+ mov [Q10], Q7
+
+ add Q2, 8
+
+ dec D12
+ jnz SymCryptFdef369MontgomeryReduceAsmOuterLoop
+
+ //
+ // Most of the work is done - now all that is left is to subtract the modulus if it is smaller than the result
+ //
+
+ // First we compute the pSrc result minus the modulus into the destination
+ mov D11, D4 // loop ctr
+ mov Q10, Q2 // pSrc
+ mov Q9, Q1 // pMod
+ mov Q7, Q3 // pDst
+
+ // Cy = 0 because the last 'adc Q8, 0' cannot produce a carry (it leaves Q8 at 0, 1, or 2)
+
+ALIGN(16)
+
+SymCryptFdef369MontgomeryReduceAsmSubLoop:
+ mov Q0,[Q10]
+ sbb Q0,[Q9]
+ mov [Q7], Q0
+
+ mov Q0,[Q10 + 8]
+ sbb Q0,[Q9 + 8]
+ mov [Q7 + 8], Q0
+
+ mov Q0,[Q10 + 16]
+ sbb Q0,[Q9 + 16]
+ mov [Q7 + 16], Q0
+
+ lea Q10,[Q10 + 24]
+ lea Q9,[Q9 + 24]
+ lea Q7,[Q7 + 24]
+
+ dec D11
+ jnz SymCryptFdef369MontgomeryReduceAsmSubLoop
+
+ // Finally a masked copy from pSrc to pDst
+ // copy if: Q8 == 0 && Cy = 1
+ sbb Q8, 0 // mask (64 bits)
+
+ALIGN(16)
+
+SymCryptFdef369MontgomeryReduceAsmMaskedCopyLoop:
+ mov Q0, [Q2]
+ mov Q1, [Q3]
+ xor Q0, Q1
+ and Q0, Q8
+ xor Q0, Q1
+ mov [Q3], Q0
+
+ mov Q0, [Q2 + 8]
+ mov Q1, [Q3 + 8]
+ xor Q0, Q1
+ and Q0, Q8
+ xor Q0, Q1
+ mov [Q3 + 8], Q0
+
+ mov Q0, [Q2 + 16]
+ mov Q1, [Q3 + 16]
+ xor Q0, Q1
+ and Q0, Q8
+ xor Q0, Q1
+ mov [Q3 + 16], Q0
+
+ // Move on to the next digit
+
+ add Q2, 24
+ add Q3, 24
+ dec D4
+ jnz SymCryptFdef369MontgomeryReduceAsmMaskedCopyLoop
+
+MUL_FUNCTION_END(SymCryptFdef369MontgomeryReduceAsm)
+
+FILE_END()
diff --git a/lib/amd64/fdef_asm.asm b/lib/amd64/fdef_asm.asm
deleted file mode 100644
index 8f53bf7..0000000
--- a/lib/amd64/fdef_asm.asm
+++ /dev/null
@@ -1,2165 +0,0 @@
-;
-; fdef_asm.asm Assembler code for large integer arithmetic in the default data format
-;
-; Copyright (c) Microsoft Corporation. Licensed under the MIT license.
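[Editor's note] Before the deleted fdef_asm.asm below, a compact C model of the word-by-word Montgomery reduction just shown may help. This is a sketch only: it assumes the GCC/Clang unsigned __int128 extension, takes nWords = 3 * (nDigits + 1) as the asm does, treats inv64 as the standard Montgomery constant -Mod[0]^(-1) mod 2^64, and omits the constant-time final subtraction (which the asm performs with the sub loop plus masked copy above):

    #include <stdint.h>

    // For each low word of src, pick m so that adding m*mod zeroes that word,
    // then propagate the carries -- mirroring the asm's outer/inner loops.
    static void MontgomeryReduce( const uint64_t *mod, uint64_t inv64,
                                  uint64_t *src, int nWords )
    {
        uint64_t hiCarry = 0;                              // Q8 in the asm
        for( int i = 0; i < nWords; i++ )
        {
            uint64_t m = src[i] * inv64;                   // 'imul Q6, Q5'
            uint64_t carry = 0;
            for( int j = 0; j < nWords; j++ )
            {
                unsigned __int128 t = (unsigned __int128)m * mod[j]
                                      + src[i + j] + carry;
                src[i + j] = (uint64_t)t;                  // becomes 0 for j == 0
                carry = (uint64_t)( t >> 64 );
            }
            unsigned __int128 t = (unsigned __int128)src[i + nWords] + carry + hiCarry;
            src[i + nWords] = (uint64_t)t;
            hiCarry = (uint64_t)( t >> 64 );
        }
        // Result is src[nWords .. 2*nWords - 1] (plus hiCarry); it is < 2*Mod,
        // so one conditional subtraction of Mod finishes the reduction.
    }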
-; - -include ksamd64.inc - -include symcrypt_version.inc -include symcrypt_magic.inc - - - -include C_asm_shared.inc - -include fdef_mul_macros.asm - - altentry SymCryptFdefMontgomerReduce256AsmInternal - - -;UINT32 -;SYMCRYPT_CALL -;SymCryptFdefRawAdd( -; _In_reads_bytes_( nDigits*SYMCRYPT_FDEF_DIGIT_SIZE ) PCBYTE pbSrc, -; _Inout_updates_bytes_( nDigits*SYMCRYPT_FDEF_DIGIT_SIZE ) PBYTE pbDst, -; UINT32 nDigits, -; UINT32 mask ); - - LEAF_ENTRY SymCryptFdefRawAddAsm, _TEXT - - ; rcx = Src1 - ; rdx = Src2 - ; r8 = Dst - ; r9 = nDigits - - add r9d, r9d ; loop over each half digit - xor rax, rax - xor r10, r10 - -SymCryptFdefRawAddAsmLoop: - ; carry is in the carry flag - mov rax,[rcx] - adc rax,[rdx] - mov [r8],rax - - mov rax,[rcx + 8] - adc rax,[rdx + 8] - mov [r8 + 8], rax - - mov rax,[rcx + 16] - adc rax,[rdx + 16] - mov [r8 + 16], rax - - mov rax,[rcx + 24] - adc rax,[rdx + 24] - mov [r8 + 24], rax - - lea rcx, [rcx + 32] - lea rdx, [rdx + 32] - lea r8, [r8 + 32] - dec r9d - jnz SymCryptFdefRawAddAsmLoop - - mov rax, r10 - adc rax, r10 - - ret - - LEAF_END SymCryptFdefRawAddAsm, _TEXT - - -;UINT32 -;SYMCRYPT_CALL -;SymCryptFdefRawSub( -; _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 Src1, -; _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 Src2, -; _Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 Dst, -; UINT32 nDigits ) - - LEAF_ENTRY SymCryptFdefRawSubAsm, _TEXT - - ; rcx = Src1 - ; rdx = Src2 - ; r8 = Dst - ; r9 = nDigits - - add r9d, r9d ; loop over each half digit - xor rax, rax - xor r10, r10 - -SymCryptFdefRawSubAsmLoop: - ; carry is in the carry flag - mov rax,[rcx] - sbb rax,[rdx] - mov [r8],rax - - mov rax,[rcx + 8] - sbb rax,[rdx + 8] - mov [r8 + 8], rax - - mov rax,[rcx + 16] - sbb rax,[rdx + 16] - mov [r8 + 16], rax - - mov rax,[rcx + 24] - sbb rax,[rdx + 24] - mov [r8 + 24], rax - - lea rcx, [rcx + 32] - lea rdx, [rdx + 32] - lea r8, [r8 + 32] - dec r9d - jnz SymCryptFdefRawSubAsmLoop - - mov rax, r10 - adc rax, r10 - - ret - - LEAF_END SymCryptFdefRawSubAsm, _TEXT - - - -;VOID -;SYMCRYPT_CALL -;SymCryptFdefMaskedCopy( -; _In_reads_bytes_( nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCBYTE pbSrc, -; _InOut_writes_bytes_( nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PBYTE pbDst, -; UINT32 nDigits, -; UINT32 mask ) - - LEAF_ENTRY SymCryptFdefMaskedCopyAsm, _TEXT - - add r8d, r8d ; loop over half digits - - movd xmm0, r9d ; xmm0[0] = mask - pcmpeqd xmm1, xmm1 ; xmm1 = ff...ff - pshufd xmm0, xmm0, 0 ; xmm0[0..3] = mask - pxor xmm1, xmm0 ; xmm1 = not Mask - -SymCryptFdefMaskedCopyAsmLoop: - movdqa xmm2, [rcx] ; xmm2 = pSrc[0] - movdqa xmm3, [rdx] ; xmm3 = pDst[0] - pand xmm2, xmm0 ; - pand xmm3, xmm1 ; - por xmm2, xmm3 - movdqa [rdx], xmm2 - - movdqa xmm2, [rcx + 16] ; xmm2 = pSrc[0] - movdqa xmm3, [rdx + 16] ; xmm3 = pDst[0] - pand xmm2, xmm0 ; - pand xmm3, xmm1 ; - por xmm2, xmm3 - movdqa [rdx + 16], xmm2 - - ; Move on to the next digit - - add rcx, 32 - add rdx, 32 - sub r8d, 1 - jnz SymCryptFdefMaskedCopyAsmLoop - ret - - LEAF_END SymCryptFdefMaskedCopyAsm, _TEXT - -;VOID -;SYMCRYPT_CALL -;SymCryptFdefRawMul( -; _In_reads_(nWords1) PCUINT32 pSrc1, -; UINT32 nDigits1, -; _In_reads_(nWords2) PCUINT32 pSrc2, -; UINT32 nDigits2, -; _Out_writes_(nWords1 + nWords2) PUINT32 pDst ) - -SymCryptFdefRawMulAsm_Frame struct - SavedRdi dq ? - SavedRsi dq ? - SavedR15 dq ? - SavedR14 dq ? - SavedR13 dq ? - SavedR12 dq ? - SavedRbx dq ? - returnaddress dq ? - Arg1Home dq ? - Arg2Home dq ? - Arg3Home dq ? - Arg4Home dq ? - pDst dq ? 
- -SymCryptFdefRawMulAsm_Frame ends - - NESTED_ENTRY SymCryptFdefRawMulAsm, _TEXT - - rex_push_reg rbx - push_reg r12 - push_reg r13 - push_reg r14 - push_reg r15 - push_reg rsi - push_reg rdi - - END_PROLOGUE - - ; Basic structure: - ; for each word in Src1: - ; Dst += Src2 * word - ; Register assignments - ; - ; rax = tmp for mul - ; rbx = word from Src1 to multiply with - ; rcx = pSrc1 (updated in outer loop) - ; rdx = tmp for mul - ; rsi = inner loop pointer into pSrc2 - ; rdi = inner loop pointer into pDst - ; r8 = pSrc2 - ; r9 = nDigits2 - ; r10 = pDst (incremented in outer loop) - ; r11 = # words left from Src1 to process - ; r12 = carry for even words (64 bits) - ; r13 = inner loop counter - ; r15 = carry for odd words (64 bits) - - mov r11, rdx ; nDigits1 - shl r11, 3 ; nDigits1 * 8 = # words in Src1 to process - mov r10, [rsp + SymCryptFdefRawMulAsm_Frame.pDst ] - - ; Outer loop invariant established: rcx, r8, r9, r10 - - - mov rsi, r8 ; rsi = pSrc2 - mov rdi, r10 ; rdi = pDst + outer loop ctr - mov rbx, [rcx] ; mulword - xor r12, r12 - mov r13, r9 - - ; First inner loop overwrites Dst, which avoids adding the current Dst value - - ALIGN 16 - -SymCryptFdefRawMulAsmLoop1: - MULT_SINGLEADD_128 0, rsi, rdi - MULT_SINGLEADD_128 2, rsi, rdi - MULT_SINGLEADD_128 4, rsi, rdi - MULT_SINGLEADD_128 6, rsi, rdi - - lea rsi,[rsi + 64] - lea rdi,[rdi + 64] - - dec r13 - jnz SymCryptFdefRawMulAsmLoop1 - - mov [rdi], r12 ; write last word, cannot overflow because Dst is at least 2 digits long - - sub r11, 1 - - ALIGN 16 - -SymCryptFdefRawMulAsmLoopOuter: - - add rcx, 8 ; move to next word of pSrc1 - add r10, 8 ; move Dst pointer one word over - mov rbx, [rcx] - mov rsi, r8 - mov rdi, r10 - xor r12, r12 - mov r13, r9 - - ALIGN 16 - -SymCryptFdefRawMulAsmLoop2: - MULT_DOUBLEADD_128 0, rsi, rdi - MULT_DOUBLEADD_128 2, rsi, rdi - MULT_DOUBLEADD_128 4, rsi, rdi - MULT_DOUBLEADD_128 6, rsi, rdi - - lea rsi,[rsi + 64] - lea rdi,[rdi + 64] - - dec r13 - jnz SymCryptFdefRawMulAsmLoop2 - - mov [rdi], r12 ; write next word. (stays within Dst buffer) - - sub r11, 1 - jnz SymCryptFdefRawMulAsmLoopOuter - - BEGIN_EPILOGUE - - pop rdi - pop rsi - pop r15 - pop r14 - pop r13 - pop r12 - pop rbx - ret - - NESTED_END SymCryptFdefRawMulAsm, _TEXT - -; VOID -; SYMCRYPT_CALL -; SymCryptFdefRawSquareAsm( -; _In_reads_(nDgigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc, -; UINT32 nDigits, -; _Out_writes_(2*nWords) PUINT32 pDst ) - -SymCryptFdefRawSquareAsm_Frame struct - - SavedRcx dq ? - SavedRdi dq ? - SavedRsi dq ? - SavedR15 dq ? - SavedR14 dq ? - SavedR13 dq ? - SavedR12 dq ? - SavedRbx dq ? - returnaddress dq ? - Arg1Home dq ? - Arg2Home dq ? - Arg3Home dq ? 
- -SymCryptFdefRawSquareAsm_Frame ends - - NESTED_ENTRY SymCryptFdefRawSquareAsm, _TEXT - - rex_push_reg rbx - push_reg r12 - push_reg r13 - push_reg r14 - push_reg r15 - push_reg rsi - push_reg rdi - push_reg rcx - - END_PROLOGUE - - ; Register assignments - ; - ; rax = tmp for mul - ; rbx = word from Src to multiply with - ; rcx = outer loop pointer into pSrc - ; rdx = tmp for mul - ; rsi = inner loop pointer into pSrc - ; rdi = inner loop pointer into pDst - ; r8 = pDst (constant) - ; r9 = nDigits (constant) - ; r10 = outer loop pointer into pDst - ; r11 = outer loop counter of #words left - ; r12 = carry for even words (64 bits) - ; r13 = inner loop counter of #words left - ; r14 = cyclic counter that specifies on which branch we jump into - ; r15 = carry for odd words (64 bits) - - mov r9, rdx ; nDigits - - ; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - ; First Pass - Addition of the cross products x_i*x_j with i!=j - ; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - ; - ; At the beginning of each inner loop we will jump over the - ; words that don't need processing. The decision of the jump - ; will be based on the cyclic counter r14. - ; - ; For the first pass we loop over **half** digits since having a smaller - ; number of jumps (i.e. 4) is actually faster than having 8 jumps. - ; - ; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - - mov r11, rdx ; nDigits - shl r11, 3 ; r11 = outer #words - mov r10, r8 ; r10 = outer pDst - - mov rsi, rcx ; rsi = inner pSrc - mov rdi, r10 ; rdi = inner pDst - - ; Initial inner loop overwrites Dst, which avoids adding the current Dst value - - mov rbx, [rcx] ; mulword - - xor r12, r12 ; carry = 0 - xor r15, r15 ; carry = 0 - - mov r13, r11 ; r13 = inner #words - mov [rdi], r12 ; Write 0 in the first word - - ; Skip over the first word - jmp SymCryptFdefRawSquareAsmInnerLoopInit_Word1 - - ALIGN 16 -SymCryptFdefRawSquareAsmInnerLoopInit_Word0: - SQR_SINGLEADD_64 0, rsi, rdi, r12, r15 - - ALIGN 16 -SymCryptFdefRawSquareAsmInnerLoopInit_Word1: - SQR_SINGLEADD_64 1, rsi, rdi, r15, r12 - - SQR_SINGLEADD_64 2, rsi, rdi, r12, r15 - - SQR_SINGLEADD_64 3, rsi, rdi, r15, r12 - - lea rsi, [rsi + 32] - lea rdi, [rdi + 32] - sub r13, 4 - jnz SymCryptFdefRawSquareAsmInnerLoopInit_Word0 - - mov [rdi], r12 ; write last word, cannot overflow because Dst is at least 2 digits long - - sub r11, 1 ; Counter for the outer loop - mov r14, 1 ; Cyclic counter r14 = 1 - - ALIGN 16 -SymCryptFdefRawSquareAsmLoopOuter: - - add r10, 8 ; move Dst pointer 1 word over - - mov rsi, rcx ; rsi = inner pSrc - mov rdi, r10 ; rdi = inner pDst - - mov rbx, [rcx + 8*r14] ; Get the next mulword - - inc r14b ; Increment the cyclic counter by 1 - - mov r13, r11 ; # of words for the inner loop - add r13, 2 - and r13, 0FFFFFFFFFFFFFFFCh ; Zero out the 2 lower bits - - xor r12, r12 ; carry = 0 - xor r15, r15 ; carry = 0 - - ; Logic to find the correct jump - cmp r14b, 3 - je SymCryptFdefRawSquareAsmInnerLoop_Word3 - cmp r14b, 2 - je SymCryptFdefRawSquareAsmInnerLoop_Word2 - cmp r14b, 1 - je SymCryptFdefRawSquareAsmInnerLoop_Word1 - - ; The following instructions are only executed when r14b == 4 - xor r14b, r14b ; Set it to 0 for the next iteration - - add rcx, 32 ; move pSrc 4 words over - add r10, 32 ; move destination 4 words over - - mov rsi, rcx ; rsi = inner pSrc - mov rdi, r10 ; rdi = inner pDst - - ALIGN 16 -SymCryptFdefRawSquareAsmInnerLoop_Word0: - SQR_DOUBLEADD_64 0, rsi, rdi, r12, r15 - - ALIGN 16 
-SymCryptFdefRawSquareAsmInnerLoop_Word1: - SQR_DOUBLEADD_64 1, rsi, rdi, r15, r12 - - ALIGN 16 -SymCryptFdefRawSquareAsmInnerLoop_Word2: - SQR_DOUBLEADD_64 2, rsi, rdi, r12, r15 - - ALIGN 16 -SymCryptFdefRawSquareAsmInnerLoop_Word3: - SQR_DOUBLEADD_64 3, rsi, rdi, r15, r12 - - lea rsi, [rsi + 32] - lea rdi, [rdi + 32] - sub r13, 4 - jnz SymCryptFdefRawSquareAsmInnerLoop_Word0 - - mov [rdi], r12 ; write next word. (stays within Dst buffer) - - dec r11 - cmp r11, 1 - jne SymCryptFdefRawSquareAsmLoopOuter - - xor rdx, rdx - mov [r10 + 40], rdx ; Final word = 0 - - - ; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - ; Second Pass - Shifting all results 1 bit left - ; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - - mov r11, r9 ; nDigits - mov rdi, r8 ; pDst pointer - shl r11, 1 ; 2*nDigits - - ALIGN 16 -SymCryptFdefRawSquareAsmSecondPass: - SQR_SHIFT_LEFT 0 - SQR_SHIFT_LEFT 1 - SQR_SHIFT_LEFT 2 - SQR_SHIFT_LEFT 3 - - SQR_SHIFT_LEFT 4 - SQR_SHIFT_LEFT 5 - SQR_SHIFT_LEFT 6 - SQR_SHIFT_LEFT 7 - - lea rdi, [rdi + 64] - dec r11 - jnz SymCryptFdefRawSquareAsmSecondPass - - ; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - ; Third Pass - Adding the squares on the even columns and propagating the sum - ; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - - mov rsi, [rsp + SymCryptFdefRawSquareAsm_Frame.SavedRcx] - mov rdi, r8 ; rdi = pDst - - xor r12, r12 - -SymCryptFdefRawSquareAsmThirdPass: - SQR_DIAGONAL_PROP 0 - SQR_DIAGONAL_PROP 1 - SQR_DIAGONAL_PROP 2 - SQR_DIAGONAL_PROP 3 - SQR_DIAGONAL_PROP 4 - SQR_DIAGONAL_PROP 5 - SQR_DIAGONAL_PROP 6 - SQR_DIAGONAL_PROP 7 - - add rsi, 64 ; One digit up - add rdi, 128 ; Two digits up - sub r9, 1 - jnz SymCryptFdefRawSquareAsmThirdPass - - BEGIN_EPILOGUE - - pop rcx - pop rdi - pop rsi - pop r15 - pop r14 - pop r13 - pop r12 - pop rbx - ret - - NESTED_END SymCryptFdefRawSquareAsm, _TEXT - - -;VOID -;SymCryptFdefMontgomeryReduceAsm( -; _In_ PCSYMCRYPT_MODULUS pmMod, -; _In_ PUINT32 pSrc, -; _Out_ PUINT32 pDst ) - - NESTED_ENTRY SymCryptFdefMontgomeryReduceAsm, _TEXT - - rex_push_reg rbx - push_reg r12 - push_reg r13 - push_reg r14 - push_reg r15 - push_reg rsi - push_reg rdi - push_reg rbp - - END_PROLOGUE - - mov r11, rdx ; r11 = pSrc - mov ebp, [rcx + SymCryptModulusNdigitsOffsetAmd64] ; nDigits - mov r13, [rcx + SymCryptModulusMontgomeryInv64OffsetAmd64] ; inv64 - - lea rcx, [rcx + SymCryptModulusValueOffsetAmd64] ; modulus value - - mov edi, ebp ; outer loop counter - shl edi, 3 ; edi is in words - - xor r14d, r14d - - ; General register allocations - ; rax = multiply result - ; rbx = multiplier in inner loop - ; rcx = pointer to modulus value - ; rdx = multiply result - ; rsi = loop counter - ; rdi = loop counter - ; rbp = nDigits - ; r8 = pDst - ; r9 = running pointer in Src - ; r10 = running pointer in Mod - ; r11 = pSrc (updated in outer loop) - ; r12 = carry - ; r13 = pmMod->tm.montgomery.inv64 - ; r14 = carry out from last word of previous loop iteration - - ALIGN 16 - -SymCryptFdefMontgomeryReduceAsmOuterLoop: - - ; start decoder with a few simple instructions, including at least one that requires - ; a uop execution and is on the critical path - - mov rbx, [r11] ; fetch word of Src we want to set to zero - mov r10, r11 - mov r9, rcx - - imul rbx, r13 ; lower word is same for signed & unsigned multiply - - mov esi, ebp - xor r12d, r12d - - ALIGN 16 - -SymCryptFdefMontgomeryReduceAsmInnerloop: - ; rax = mul scratch - ; rbx = multiplier - ; rcx = pointer to 
modulus value - ; rdx = mul scratch - ; edi = outer loop counter (words) - ; esi = inner loop counter (digits) - ; r9 = running ptr to modulus - ; r10 = running ptr to input/scratch - ; r12 = carry for even words (64 bits) - ; r15 = carry for odd words (64 bits) - - MULT_DOUBLEADD_128 0, r9, r10 - MULT_DOUBLEADD_128 2, r9, r10 - MULT_DOUBLEADD_128 4, r9, r10 - MULT_DOUBLEADD_128 6, r9, r10 - - lea r9,[r9 + 64] - lea r10,[r10 + 64] - - dec esi - jnz SymCryptFdefMontgomeryReduceAsmInnerloop - - add r12, r14 - mov r14d, 0 - adc r14, 0 - add r12, [r10] - adc r14, 0 - mov [r10], r12 - - lea r11,[r11 + 8] - - dec edi - jnz SymCryptFdefMontgomeryReduceAsmOuterLoop - - ; - ; Most of the work is done; now all that is left is subtract the modulus if it is smaller than the result - ; - - ; First we compute the pSrc result minus the modulus into the destination - mov esi, ebp ; loop ctr - mov r10, r11 ; pSrc - mov r9, rcx ; pMod - mov r12, r8 ; pDst - - ; Cy = 0 because the last 'sub edi,1' resulted in 0 - - ALIGN 16 - -SymCryptFdefMontgomeryReduceAsmSubLoop: - mov rax,[r10] - sbb rax,[r9] - mov [r12], rax - - mov rax,[r10 + 8] - sbb rax,[r9 + 8] - mov [r12 + 8], rax - - mov rax,[r10 + 16] - sbb rax,[r9 + 16] - mov [r12 + 16], rax - - mov rax,[r10 + 24] - sbb rax,[r9 + 24] - mov [r12 + 24], rax - - mov rax,[r10 + 32] - sbb rax,[r9 + 32] - mov [r12 + 32], rax - - mov rax,[r10 + 40] - sbb rax,[r9 + 40] - mov [r12 + 40], rax - - mov rax,[r10 + 48] - sbb rax,[r9 + 48] - mov [r12 + 48], rax - - mov rax,[r10 + 56] - sbb rax,[r9 + 56] - mov [r12 + 56], rax - - lea r10,[r10 + 64] - lea r9,[r9 + 64] - lea r12,[r12 + 64] - - dec esi - jnz SymCryptFdefMontgomeryReduceAsmSubLoop - - ; Finally a masked copy form pSrc to pDst - ; copy if: r14 == 0 && Cy = 1 - sbb r14d, 0 - - movd xmm0, r14d ; xmm0[0] = mask - pcmpeqd xmm1, xmm1 ; xmm1 = ff...ff - pshufd xmm0, xmm0, 0 ; xmm0[0..3] = mask - pxor xmm1, xmm0 ; xmm1 = not Mask - - ALIGN 16 - -SymCryptFdefMontgomeryReduceAsmMaskedCopyLoop: - movdqa xmm2, [r11] ; xmm2 = pSrc[0] - movdqa xmm3, [r8] ; xmm3 = pDst[0] - pand xmm2, xmm0 ; - pand xmm3, xmm1 ; - por xmm2, xmm3 - movdqa [r8], xmm2 - - movdqa xmm2, [r11 + 16] ; xmm2 = pSrc[0] - movdqa xmm3, [r8 + 16] ; xmm3 = pDst[0] - pand xmm2, xmm0 ; - pand xmm3, xmm1 ; - por xmm2, xmm3 - movdqa [r8 + 16], xmm2 - - movdqa xmm2, [r11 + 32] ; xmm2 = pSrc[0] - movdqa xmm3, [r8 + 32] ; xmm3 = pDst[0] - pand xmm2, xmm0 ; - pand xmm3, xmm1 ; - por xmm2, xmm3 - movdqa [r8 + 32], xmm2 - - movdqa xmm2, [r11 + 48] ; xmm2 = pSrc[0] - movdqa xmm3, [r8 + 48] ; xmm3 = pDst[0] - pand xmm2, xmm0 ; - pand xmm3, xmm1 ; - por xmm2, xmm3 - movdqa [r8 + 48], xmm2 - - ; Move on to the next digit - lea r11,[r11 + 64] - lea r8,[r8 + 64] - - dec ebp - jnz SymCryptFdefMontgomeryReduceAsmMaskedCopyLoop - - BEGIN_EPILOGUE - - pop rbp - pop rdi - pop rsi - pop r15 - pop r14 - pop r13 - pop r12 - pop rbx - ret - - NESTED_END SymCryptFdefMontgomeryReduceAsm, _TEXT - - -; -------------------------------- -; 256-bit size specific functions -; -------------------------------- - -;VOID -;SYMCRYPT_CALL -;SymCryptFdefModAdd256( -; _In_ PCSYMCRYPT_MODULUS pmMod, -; _In_ PCSYMCRYPT_MODELEMENT peSrc1, -; _In_ PCSYMCRYPT_MODELEMENT peSrc2, -; _Out_ PSYMCRYPT_MODELEMENT peDst, -; _Out_writes_bytes_( cbScratch ) PBYTE pbScratch, -; SIZE_T cbScratch ); - - NESTED_ENTRY SymCryptFdefModAdd256Asm, _TEXT - - push_reg r12 - push_reg r13 - push_reg r14 - push_reg rbx - - END_PROLOGUE - - ; rcx = pmMod - ; rdx = peSrc1 - ; r8 = peSrc2 - ; r9 = peDst - - ; compute Src1 + Src2 
into (rax, rbx, r10, r11) with carry out mask in r12 - - mov rax, [rdx] - add rax, [r8 ] - mov rbx, [rdx + 8] - adc rbx, [r8 + 8] - mov r10, [rdx + 16] - adc r10, [r8 + 16] - mov r11, [rdx + 24] - adc r11, [r8 + 24] - sbb r12, r12 ; r12 = carry out mask - - ; rdx, r8: free - - ; Compute sum - Mod into (rdx, r8, r13, r14) = sum - modulus, rcx = carry out mask - - add rcx, SymCryptModulusValueOffsetAmd64 - - mov rdx, rax - sub rdx, [rcx] - mov r8, rbx - sbb r8, [rcx + 8] - mov r13, r10 - sbb r13, [rcx + 16] - mov r14, r11 - sbb r14, [rcx + 24] - - sbb rcx, rcx ; rcx = carry out mask - - ; Choose between the two - ; addition carry = 1, then subtraction carry = 1 and we pick the 2nd result. - ; addition carry = 0 and subtraction carry = 0: pick 2nd result - ; addition carry = 0 and subtraction carry = 1: pick first result - - xor rcx, r12 ; 0 = 2nd result, 1 = first result - - xor rax, rdx - xor rbx, r8 - xor r10, r13 - xor r11, r14 - - and rax, rcx - and rbx, rcx - and r10, rcx - and r11, rcx - - xor rdx, rax - xor r8 , rbx - xor r13, r10 - xor r14, r11 - - mov [r9 + 0], rdx - mov [r9 + 8], r8 - mov [r9 + 16], r13 - mov [r9 + 24], r14 - - BEGIN_EPILOGUE - - pop rbx - pop r14 - pop r13 - pop r12 - ret - - NESTED_END SymCryptFdefModAdd256Asm, _TEXT - - - - NESTED_ENTRY SymCryptFdefModSub256Asm, _TEXT - - push_reg r12 - push_reg r13 - push_reg rbx - - END_PROLOGUE - - ; rcx = pmMod - ; rdx = peSrc1 - ; r8 = peSrc2 - ; r9 = peDst - - ; compute Src1 - Src2 into (rax, rbx, r10, r11) with carry out mask in r12 - - mov rax, [rdx] - sub rax, [r8 ] - mov rbx, [rdx + 8] - sbb rbx, [r8 + 8] - mov r10, [rdx + 16] - sbb r10, [r8 + 16] - mov r11, [rdx + 24] - sbb r11, [r8 + 24] - sbb r12, r12 ; r12 = carry out mask - - ; rdx, r8: free - - ; Load Mod into into (rdx, r8, r13, rcx) - - add rcx, SymCryptModulusValueOffsetAmd64 - - mov rdx, [rcx] - mov r8, [rcx + 8] - mov r13, [rcx + 16] - mov rcx, [rcx + 24] - - ; Mask the value to be added to zero if there was no underflow - and rdx, r12 - and r8 , r12 - and r13, r12 - and rcx, r12 - - ; Add the (masked) modulus - add rax, rdx - adc rbx, r8 - adc r10, r13 - adc r11, rcx - - mov [r9 + 0], rax - mov [r9 + 8], rbx - mov [r9 + 16], r10 - mov [r9 + 24], r11 - - BEGIN_EPILOGUE - - pop rbx - pop r13 - pop r12 - ret - - NESTED_END SymCryptFdefModSub256Asm, _TEXT - -;================================================= -; Multiplication -; - -;VOID -;SYMCRYPT_CALL -;SymCryptFdefModMulMontgomery256Asm( -; _In_ PCSYMCRYPT_MODULUS pMod, -; _In_ PCSYMCRYPT_MODELEMENT pSrc1, -; _In_ PCSYMCRYPT_MODELEMENT pSrc2, -; _Out_ PSYMCRYPT_MODELEMENT pDst, -; _Out_writes_bytes_( cbScratch ) PBYTE pbScratch, -; SIZE_T cbScratch ); - - NESTED_ENTRY SymCryptFdefModMulMontgomery256Asm, _TEXT - - MULT_COMMON_PROLOGUE ; saves all registers - - mov rsi, rdx ; we need rdx for the multiplication - - ; rcx = pMod - ; rsi = pSrc1 - ; r8 = pSrc2 - ; r9 = pDst - - ; First we compute the product. 
The result will be in 8 registers - ; rdi, rbp, r10, r11, r12, r13, r14, r15 - - mov rbx, [rsi] - xor r10, r10 - xor r11, r11 - xor r12, r12 - - mov rax, [r8] - mul rbx - mov rdi, rax - mov rbp, rdx - - mov rax, [r8 + 8] - mul rbx - add rbp, rax - adc r10, rdx - - mov rax, [r8 + 16] - mul rbx - add r10, rax - adc r11, rdx - - mov rax, [r8 + 24] - mul rbx - add r11, rax - adc r12, rdx - - ; Second row - mov rbx, [rsi + 8] - MUL14 rbx, r8, rbp, r10, r11, r12, r15 - mov r13, rdx - - ; third row - mov rbx, [rsi + 16] - MUL14 rbx, r8, r10, r11, r12, r13, r15 - mov r14, rdx - - ; fourth row - mov rbx, [rsi + 24] - MUL14 rbx, r8, r11, r12, r13, r14, r15 - mov r15, rdx - - - ALTERNATE_ENTRY SymCryptFdefMontgomerReduce256AsmInternal - ; Invariant: - ; common prologue used - ; 512-bit result in (rdi, rbp, r10, r11, r12, r13, r14, r15) - ; rcx = pmMod - ; r9 = peDst - - mov r8, [rcx + SymCryptModulusMontgomeryInv64OffsetAmd64] ; inv64 - add rcx, SymCryptModulusValueOffsetAmd64 - - mov rbx, rdi - imul rbx, r8 ; lower word is the same for signed & unsigned multiply; rbx = multiplicand for first row - MUL14 rbx, rcx, rdi, rbp, r10, r11, rdi - mov rdi, rdx ; Save the out carries in (eventually) (rdi, rbp, r10, r11) - - mov rbx, rbp - imul rbx, r8 - MUL14 rbx, rcx, rbp, r10, r11, r12, rbp - mov rbp, rdx ; Save the out carries in (eventually) (rdi, rbp, r10, r11) - - mov rbx, r10 - imul rbx, r8 - MUL14 rbx, rcx, r10, r11, r12, r13, r10 - mov r10, rdx - - mov rbx, r11 - imul rbx, r8 - MUL14 rbx, rcx, r11, r12, r13, r14, r11 - ; mov r11, rdx - - add r12, rdi - adc r13, rbp - adc r14, r10 - adc r15, rdx - - sbb rbx, rbx ; Carry out from final addition in mask form - - ; reduced value in (r12, r13, r14, r15, -rbx), and it is less than 2*Modulus - - mov rdi, r12 - sub rdi, [rcx] - mov rbp, r13 - sbb rbp, [rcx + 8] - mov r10, r14 - sbb r10, [rcx + 16] - mov r11, r15 - sbb r11, [rcx + 24] - - sbb rcx, rcx ; rcx = carry out mask - - ; Choose between the two - ; addition carry = 1, then subtraction carry = 1 and we pick the 2nd result. - ; addition carry = 0 and subtraction carry = 0: pick 2nd result - ; addition carry = 0 and subtraction carry = 1: pick first result - - xor rcx, rbx ; 0 = 2nd result, 1 = first result - - xor r12, rdi - xor r13, rbp - xor r14, r10 - xor r15, r11 - - and r12, rcx - and r13, rcx - and r14, rcx - and r15, rcx - - xor rdi, r12 - xor rbp, r13 - xor r10, r14 - xor r11, r15 - - mov [r9 + 0], rdi - mov [r9 + 8], rbp - mov [r9 + 16], r10 - mov [r9 + 24], r11 - - MULT_COMMON_EPILOGUE - - NESTED_END SymCryptFdefModMulMontgomery256Asm, _TEXT - - -;VOID -;SYMCRYPT_CALL -;SymCryptFdefMontgomeryReduce256Asm( -; _In_ PCSYMCRYPT_MODULUS pmMod, -; _In_ PUINT32 pSrc, -; _Out_ PUINT32 pDst ); - - NESTED_ENTRY SymCryptFdefMontgomeryReduce256Asm, _TEXT - - MULT_COMMON_PROLOGUE ; saves all registers - - mov r9, r8 - mov rdi, [rdx + 0] - mov rbp, [rdx + 8] - mov r10, [rdx + 16] - mov r11, [rdx + 24] - mov r12, [rdx + 32] - mov r13, [rdx + 40] - mov r14, [rdx + 48] - mov r15, [rdx + 56] - - - ; Normal code doesn't jump from the body of one function to the body of another function. - ; Here we have ensured that our stack frames are identical, so it is safe. - ; We just have to convince the other system components that this works... 
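The 256-bit routines above all end the same way: compute both candidate results (the raw value and the value minus the modulus), turn the carry/borrow flags into full-word masks with `sbb reg, reg`, and merge the two candidates with an xor/and/xor sequence. A minimal C sketch of that branch-free selection idiom (illustrative only, not SymCrypt source; the helper names are hypothetical):

```c
#include <stdint.h>

/* Turn a 0/1 borrow into an all-zero / all-ones mask,
   like the 'sbb rcx, rcx' idiom in the assembly above. */
static inline uint64_t mask_from_borrow( uint64_t borrow )
{
    return (uint64_t)0 - borrow;
}

/* mask == all-ones selects a[], mask == 0 selects b[];
   this is the xor/and/xor merge used after the conditional subtract. */
static void ct_select( uint64_t *dst, const uint64_t *a, const uint64_t *b,
                       uint64_t mask, int n )
{
    for( int i = 0; i < n; i++ )
    {
        uint64_t t = (a[i] ^ b[i]) & mask;
        dst[i] = b[i] ^ t;
    }
}
```

Because the selection is data-independent, the running time does not reveal whether the final subtraction of the modulus was needed.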
- - ; Use conditional jump so that stack unwinder doesn't think it is an epilogue - test rsp,rsp - jne SymCryptFdefMontgomerReduce256AsmInternal ; jumps always - - int 3 ; Dummy instruction because the debugger seems to have an off-by-one - ; error and still see the (wrong) epilogue when on the JNE instruction - ; Best guess: the debugger starts the stack trace *after* the current instruction - - ; And then we need a dummy epilogue to keep the assembler happy - BEGIN_EPILOGUE - ret - - NESTED_END SymCryptFdefMontgomeryReduce256Asm, _TEXT - - -;VOID -;SYMCRYPT_CALL -;SymCryptFdefModSquareMontgomery256( -; _In_ PCSYMCRYPT_MODULUS pmMod, -; _In_ PCSYMCRYPT_MODELEMENT peSrc, -; _Out_ PSYMCRYPT_MODELEMENT peDst, -; _Out_writes_bytes_( cbScratch ) PBYTE pbScratch, -; SIZE_T cbScratch ) - - NESTED_ENTRY SymCryptFdefModSquareMontgomery256Asm, _TEXT - - MULT_COMMON_PROLOGUE - - - ; Result in rdi, rbp, r10, r11, r12, r13, r14, r15 - - mov rsi, rdx ; free up rdx for multiplication - mov r9, r8 ; need this later anyway - - ; rcx = pmMod - ; rsi = Src - ; r9 = pDst - - mov rbx, [rsi] - xor r11, r11 - xor r12, r12 - xor r13, r13 - xor r14, r14 - - ; First we compute all the terms that need doubling - - mov rax, [rsi + 8] - mul rbx - mov rbp, rax - mov r10, rdx - - mov rax, [rsi + 16] - mul rbx - add r10, rax - adc r11, rdx - - mov rax, [rsi + 24] - mul rbx - add r11, rax - adc r12, rdx - - mov rbx, [rsi + 8] - mov rax, [rsi + 16] - mul rbx - add r11, rax - adc rdx, 0 - mov r15, rdx - - mov rax, [rsi + 24] - mul rbx - add r12, rax - adc rdx, 0 - add r12, r15 - adc r13, rdx - - mov rbx, [rsi + 16] - mov rax, [rsi + 24] - mul rbx - add r13, rax - adc r14, rdx ; no overflow from this - - ; double these terms - xor r15, r15 - - add rbp, rbp - adc r10, r10 - adc r11, r11 - adc r12, r12 - adc r13, r13 - adc r14, r14 - adc r15, 0 - - mov rax, [rsi] - mul rax - mov rdi, rax - mov rbx, rdx - - mov rax, [rsi + 8] - mul rax - - add rbp, rbx - adc r10, rax - adc r11, rdx - sbb r8, r8 ; -carry - - mov rax, [rsi + 16] - mul rax - - add r8, r8 - adc r12, rax - adc r13, rdx - sbb r8, r8 - - mov rax, [rsi + 24] - mul rax - add r8, r8 - adc r14, rax - adc r15, rdx - - ; See SymCryptFdefMontgomeryReduce256Asm for a discussion of this strange epilogue sequence - test rsp,rsp - jne SymCryptFdefMontgomerReduce256AsmInternal ; jumps always - - int 3 - - BEGIN_EPILOGUE - ret - - NESTED_END SymCryptFdefModSquareMontgomery256Asm, _TEXT - -; -------------------------------- -; 512-bit size specific functions -; -------------------------------- - -;VOID -;SYMCRYPT_CALL -;SymCryptFdefRawMul512Asm( -; _In_reads_(nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc1, -; _In_reads_(nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc2, -; UINT32 nDigits, -; _Out_writes_(2*nWords) PUINT32 pDst ); - NESTED_ENTRY SymCryptFdefRawMul512Asm, _TEXT - - MULT_COMMON_PROLOGUE ; saves all registers - - ; Basic structure: - ; for each word in Src1: - ; Dst += Src2 * word - ; Register assignments - ; - ; rax = tmp for mul - ; rbx = word from Src1 to multiply with - ; rcx = pSrc1 (updated in outer loop) - ; rdx = tmp for mul - ; rsi = pSrc2 (constant) - ; rdi = pDst (incremented in outer loop) - ; r8 = nDigits (constant) - ; r9 = pDst (constant) - ; r11 = # words left from Src1 to process - ; r12 = carry for even words (64 bits) - ; r15 = carry for odd words (64 bits) - - mov r11, r8 ; nDigits - shl r11, 3 ; nDigits * 8 = # words in Src1 to process - - mov rsi, rdx ; rsi = pSrc2 - mov rdi, r9 ; rdi = pDst - mov rbx, [rcx] ; mulword - - xor r12, r12 ; 
carry - - ; First inner loop overwrites Dst, which avoids adding the current Dst value - MULT_SINGLEADD_128 0, rsi, rdi - MULT_SINGLEADD_128 2, rsi, rdi - MULT_SINGLEADD_128 4, rsi, rdi - MULT_SINGLEADD_128 6, rsi, rdi - - mov [rdi + 64], r12 ; write last word, cannot overflow because Dst is at least 2 digits long - - sub r11, 1 - - ALIGN 16 - -SymCryptFdefRawMul512AsmLoopOuter: - - lea rcx, [rcx + 8] ; move to next word of pSrc1 - lea rdi, [rdi + 8] ; move Dst pointer one word over - - mov rbx, [rcx] ; mulword - - xor r12, r12 ; carry - - MULT_DOUBLEADD_128 0, rsi, rdi - MULT_DOUBLEADD_128 2, rsi, rdi - MULT_DOUBLEADD_128 4, rsi, rdi - MULT_DOUBLEADD_128 6, rsi, rdi - - mov [rdi + 64], r12 ; write last word, cannot overflow because Dst is at least 2 digits long - - sub r11, 1 - jnz SymCryptFdefRawMul512AsmLoopOuter - - MULT_COMMON_EPILOGUE - - NESTED_END SymCryptFdefRawMul512Asm, _TEXT - -; VOID -; SYMCRYPT_CALL -; SymCryptFdefRawSquareAsm( -; _In_reads_(nDgigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc, -; UINT32 nDigits, -; _Out_writes_(2*nWords) PUINT32 pDst ) - NESTED_ENTRY SymCryptFdefRawSquare512Asm, _TEXT - - MULT_COMMON_PROLOGUE - - ; Register assignments - ; - ; rax = tmp for mul - ; rbx = word from Src to multiply with - ; rcx = outer loop pointer into pSrc - ; rdx = tmp for mul - ; rsi = inner loop pointer into pSrc - ; rdi = inner loop pointer into pDst - ; r8 = pDst (constant) - ; r9 = nDigits (constant) - ; r10 = outer loop pointer into pDst - ; r11 = outer loop counter of #words left - ; r12 = carry for even words (64 bits) - ; r13 = inner loop counter of #words left - ; r14 = pSrc (constant) - ; r15 = carry for odd words (64 bits) - - mov r9, rdx ; nDigits - mov r14, rcx ; saving pSrc - - ; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - ; First Pass - Addition of the cross products x_i*x_j with i!=j - ; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - - mov r11, rdx ; nDigits - shl r11, 3 ; r11 = outer #words - mov r10, r8 ; r10 = outer pDst - - mov rsi, rcx ; rsi = inner pSrc - mov rdi, r10 ; rdi = inner pDst - - ; Initial inner loop overwrites Dst, which avoids adding the current Dst value - ; 7 iterations - xor r15, r15 ; carry = 0 (for "odd" iterations set only the r15 carry) - mov rbx, [rcx] ; mulword - mov [rdi], r15 ; Write 0 in the first word - - SQR_SINGLEADD_64 1, rsi, rdi, r15, r12 - SQR_SINGLEADD_64 2, rsi, rdi, r12, r15 - SQR_SINGLEADD_64 3, rsi, rdi, r15, r12 - - SQR_SINGLEADD_64 4, rsi, rdi, r12, r15 - SQR_SINGLEADD_64 5, rsi, rdi, r15, r12 - SQR_SINGLEADD_64 6, rsi, rdi, r12, r15 - SQR_SINGLEADD_64 7, rsi, rdi, r15, r12 - - mov [rdi + 8*8], r12 ; write last word, cannot overflow because Dst is at least 2 digits long - add r10, 8 ; Skip over the first word - - ; 6 iterations - xor r12, r12 ; carry = 0 (for "even" iterations set only the r12 carry) - SQR_SIZE_SPECIFIC_INIT - SQR_DOUBLEADD_64_2 0 - SQR_DOUBLEADD_64_4 2 - mov [rdi + 6*8], r12 - - ; 5 iterations - xor r15, r15 - SQR_SIZE_SPECIFIC_INIT - SQR_DOUBLEADD_64 0, rsi, rdi, r15, r12 ; Notice the dst_carry is r12 since all the "double" macros have r12 as src_carry - SQR_DOUBLEADD_64_4 1 - mov [rdi + 5*8], r12 - - ; 4 iterations - xor r12, r12 - SQR_SIZE_SPECIFIC_INIT - SQR_DOUBLEADD_64_4 0 - mov [rdi + 4*8], r12 - - ; 3 iterations - xor r15, r15 - SQR_SIZE_SPECIFIC_INIT - SQR_DOUBLEADD_64 0, rsi, rdi, r15, r12 - SQR_DOUBLEADD_64_2 1 - mov [rdi + 3*8], r12 - - ; 2 iterations - xor r12, r12 - SQR_SIZE_SPECIFIC_INIT - SQR_DOUBLEADD_64_2 0 - mov [rdi + 2*8], r12 - - ; 1 
iterations - xor r15, r15 - SQR_SIZE_SPECIFIC_INIT - SQR_DOUBLEADD_64 0, rsi, rdi, r15, r12 - mov [rdi + 8], r12 - - xor rdx, rdx - mov [rdi + 16], rdx ; Final word = 0 - - - ; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - ; Second Pass - Shifting all results 1 bit left - ; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - - mov r11, r9 ; nDigits - mov rdi, r8 ; pDst pointer - shl r11, 1 ; 2*nDigits - - ALIGN 16 -SymCryptFdefRawSquare512AsmSecondPass: - SQR_SHIFT_LEFT 0 - SQR_SHIFT_LEFT 1 - SQR_SHIFT_LEFT 2 - SQR_SHIFT_LEFT 3 - - SQR_SHIFT_LEFT 4 - SQR_SHIFT_LEFT 5 - SQR_SHIFT_LEFT 6 - SQR_SHIFT_LEFT 7 - - lea rdi, [rdi + 64] - dec r11 - jnz SymCryptFdefRawSquare512AsmSecondPass - - ; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - ; Third Pass - Adding the squares on the even columns and propagating the sum - ; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - - mov rsi, r14 ; rsi = pSrc - mov rdi, r8 ; rdi = pDst - - xor r12, r12 - - SQR_DIAGONAL_PROP 0 - SQR_DIAGONAL_PROP 1 - SQR_DIAGONAL_PROP 2 - SQR_DIAGONAL_PROP 3 - SQR_DIAGONAL_PROP 4 - SQR_DIAGONAL_PROP 5 - SQR_DIAGONAL_PROP 6 - SQR_DIAGONAL_PROP 7 - - MULT_COMMON_EPILOGUE - - NESTED_END SymCryptFdefRawSquare512Asm, _TEXT - -;VOID -;SymCryptFdefMontgomeryReduce512Asm( -; _In_ PCSYMCRYPT_MODULUS pmMod, -; _In_ PUINT32 pSrc, -; _Out_ PUINT32 pDst ) - - NESTED_ENTRY SymCryptFdefMontgomeryReduce512Asm, _TEXT - - MULT_COMMON_PROLOGUE - - mov r11, rdx ; r11 = pSrc - mov ebp, [rcx + SymCryptModulusNdigitsOffsetAmd64] ; nDigits - mov r13, [rcx + SymCryptModulusMontgomeryInv64OffsetAmd64] ; inv64 - - lea rcx, [rcx + SymCryptModulusValueOffsetAmd64] ; modulus value - - mov edi, ebp ; outer loop counter - shl edi, 3 ; edi is in words - - xor r14d, r14d - - ; General register allocations - ; rax = multiply result - ; rbx = multiplier in inner loop - ; rcx = pointer to modulus value - ; rdx = multiply result - ; rsi = loop counter - ; rdi = loop counter - ; rbp = nDigits - ; r8 = pDst - ; r9 = running pointer in Src - ; r10 = running pointer in Mod - ; r11 = pSrc (updated in outer loop) - ; r12 = carry - ; r13 = pmMod->tm.montgomery.inv64 - ; r14 = carry out from last word of previous loop iteration - - ALIGN 16 - -SymCryptFdefMontgomeryReduce512AsmOuterLoop: - - ; start decoder with a few simple instructions, including at least one that requires - ; a uop execution and is on the critical path - - mov rbx, [r11] ; fetch word of Src we want to set to zero - mov r10, r11 - mov r9, rcx - - imul rbx, r13 ; lower word is same for signed & unsigned multiply - - mov esi, ebp - xor r12d, r12d - - ; rax = mul scratch - ; rbx = multiplier - ; rcx = pointer to modulus value - ; rdx = mul scratch - ; edi = outer loop counter (words) - ; esi = inner loop counter (digits) - ; r9 = running ptr to modulus - ; r10 = running ptr to input/scratch - ; r12 = carry for even words (64 bits) - ; r15 = carry for odd words (64 bits) - - MULT_DOUBLEADD_128 0, r9, r10 - MULT_DOUBLEADD_128 2, r9, r10 - MULT_DOUBLEADD_128 4, r9, r10 - MULT_DOUBLEADD_128 6, r9, r10 - - lea r10,[r10 + 64] - - add r12, r14 - mov r14d, 0 - adc r14, 0 - add r12, [r10] - adc r14, 0 - mov [r10], r12 - - lea r11,[r11 + 8] - - dec edi - jnz SymCryptFdefMontgomeryReduce512AsmOuterLoop - - ; - ; Most of the work is done; now all that is left is subtract the modulus if it is smaller than the result - ; - - ; First we compute the pSrc result minus the modulus into the destination - mov esi, ebp ; loop ctr - mov 
r10, r11 ; pSrc - mov r9, rcx ; pMod - mov r12, r8 ; pDst - - ; Cy = 0 because the last 'sub edi,1' resulted in 0 - mov rax,[r10] - sbb rax,[r9] - mov [r12], rax - - mov rax,[r10 + 8] - sbb rax,[r9 + 8] - mov [r12 + 8], rax - - mov rax,[r10 + 16] - sbb rax,[r9 + 16] - mov [r12 + 16], rax - - mov rax,[r10 + 24] - sbb rax,[r9 + 24] - mov [r12 + 24], rax - - mov rax,[r10 + 32] - sbb rax,[r9 + 32] - mov [r12 + 32], rax - - mov rax,[r10 + 40] - sbb rax,[r9 + 40] - mov [r12 + 40], rax - - mov rax,[r10 + 48] - sbb rax,[r9 + 48] - mov [r12 + 48], rax - - mov rax,[r10 + 56] - sbb rax,[r9 + 56] - mov [r12 + 56], rax - - lea r10,[r10 + 64] - lea r9,[r9 + 64] - lea r12,[r12 + 64] - - ; Finally a masked copy form pSrc to pDst - ; copy if: r14 == 0 && Cy = 1 - sbb r14d, 0 - - movd xmm0, r14d ; xmm0[0] = mask - pcmpeqd xmm1, xmm1 ; xmm1 = ff...ff - pshufd xmm0, xmm0, 0 ; xmm0[0..3] = mask - pxor xmm1, xmm0 ; xmm1 = not Mask - - ALIGN 16 - -SymCryptFdefMontgomeryReduce512AsmMaskedCopyLoop: - movdqa xmm2, [r11] ; xmm2 = pSrc[0] - movdqa xmm3, [r8] ; xmm3 = pDst[0] - pand xmm2, xmm0 ; - pand xmm3, xmm1 ; - por xmm2, xmm3 - movdqa [r8], xmm2 - - movdqa xmm2, [r11 + 16] ; xmm2 = pSrc[0] - movdqa xmm3, [r8 + 16] ; xmm3 = pDst[0] - pand xmm2, xmm0 ; - pand xmm3, xmm1 ; - por xmm2, xmm3 - movdqa [r8 + 16], xmm2 - - movdqa xmm2, [r11 + 32] ; xmm2 = pSrc[0] - movdqa xmm3, [r8 + 32] ; xmm3 = pDst[0] - pand xmm2, xmm0 ; - pand xmm3, xmm1 ; - por xmm2, xmm3 - movdqa [r8 + 32], xmm2 - - movdqa xmm2, [r11 + 48] ; xmm2 = pSrc[0] - movdqa xmm3, [r8 + 48] ; xmm3 = pDst[0] - pand xmm2, xmm0 ; - pand xmm3, xmm1 ; - por xmm2, xmm3 - movdqa [r8 + 48], xmm2 - - ; Move on to the next digit - lea r11,[r11 + 64] - lea r8,[r8 + 64] - - dec ebp - jnz SymCryptFdefMontgomeryReduce512AsmMaskedCopyLoop - - MULT_COMMON_EPILOGUE - - NESTED_END SymCryptFdefMontgomeryReduce512Asm, _TEXT - - -; -------------------------------- -; 1024-bit size specific functions -; -------------------------------- - -;VOID -;SYMCRYPT_CALL -;SymCryptFdefRawMul1024Asm( -; _In_reads_(nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc1, -; _In_reads_(nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc2, -; UINT32 nDigits, -; _Out_writes_(2*nWords) PUINT32 pDst ); - NESTED_ENTRY SymCryptFdefRawMul1024Asm, _TEXT - - MULT_COMMON_PROLOGUE ; saves all registers - - ; Basic structure: - ; for each word in Src1: - ; Dst += Src2 * word - ; Register assignments - ; - ; rax = tmp for mul - ; rbx = word from Src1 to multiply with - ; rcx = pSrc1 (updated in outer loop) - ; rdx = tmp for mul - ; rsi = pSrc2 (constant) - ; rdi = pDst (incremented in outer loop) - ; r8 = nDigits (constant) - ; r9 = pDst (constant) - ; r11 = # words left from Src1 to process - ; r12 = carry for even words (64 bits) - ; r15 = carry for odd words (64 bits) - - mov r11, r8 ; nDigits - shl r11, 3 ; nDigits * 8 = # words in Src1 to process - - mov rsi, rdx ; rsi = pSrc2 - mov rdi, r9 ; rdi = pDst - mov rbx, [rcx] ; mulword - - xor r12, r12 ; carry - - ; First inner loop overwrites Dst, which avoids adding the current Dst value - MULT_SINGLEADD_128 0, rsi, rdi - MULT_SINGLEADD_128 2, rsi, rdi - MULT_SINGLEADD_128 4, rsi, rdi - MULT_SINGLEADD_128 6, rsi, rdi - - MULT_SINGLEADD_128 8, rsi, rdi - MULT_SINGLEADD_128 10, rsi, rdi - MULT_SINGLEADD_128 12, rsi, rdi - MULT_SINGLEADD_128 14, rsi, rdi - - mov [rdi + 128], r12 ; write last word, cannot overflow because Dst is at least 2 digits long - - sub r11, 1 - - ALIGN 16 - -SymCryptFdefRawMul1024AsmLoopOuter: - - lea rcx, [rcx + 8] ; move to next word of 
pSrc1 - lea rdi, [rdi + 8] ; move Dst pointer one word over - - mov rbx, [rcx] ; mulword - - xor r12, r12 ; carry - - MULT_DOUBLEADD_128 0, rsi, rdi - MULT_DOUBLEADD_128 2, rsi, rdi - MULT_DOUBLEADD_128 4, rsi, rdi - MULT_DOUBLEADD_128 6, rsi, rdi - - MULT_DOUBLEADD_128 8, rsi, rdi - MULT_DOUBLEADD_128 10, rsi, rdi - MULT_DOUBLEADD_128 12, rsi, rdi - MULT_DOUBLEADD_128 14, rsi, rdi - - mov [rdi + 128], r12 ; write last word, cannot overflow because Dst is at least 2 digits long - - sub r11, 1 - jnz SymCryptFdefRawMul1024AsmLoopOuter - - MULT_COMMON_EPILOGUE - - NESTED_END SymCryptFdefRawMul1024Asm, _TEXT - -; VOID -; SYMCRYPT_CALL -; SymCryptFdefRawSquareAsm( -; _In_reads_(nDgigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc, -; UINT32 nDigits, -; _Out_writes_(2*nWords) PUINT32 pDst ) - NESTED_ENTRY SymCryptFdefRawSquare1024Asm, _TEXT - - MULT_COMMON_PROLOGUE - - ; Register assignments - ; - ; rax = tmp for mul - ; rbx = word from Src to multiply with - ; rcx = outer loop pointer into pSrc - ; rdx = tmp for mul - ; rsi = inner loop pointer into pSrc - ; rdi = inner loop pointer into pDst - ; r8 = pDst (constant) - ; r9 = nDigits (constant) - ; r10 = outer loop pointer into pDst - ; r11 = outer loop counter of #words left - ; r12 = carry for even words (64 bits) - ; r13 = inner loop counter of #words left - ; r14 = pSrc (constant) - ; r15 = carry for odd words (64 bits) - - mov r9, rdx ; nDigits - mov r14, rcx ; saving pSrc - - ; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - ; First Pass - Addition of the cross products x_i*x_j with i!=j - ; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - - mov r11, rdx ; nDigits - shl r11, 3 ; r11 = outer #words - mov r10, r8 ; r10 = outer pDst - - mov rsi, rcx ; rsi = inner pSrc - mov rdi, r10 ; rdi = inner pDst - - ; Initial inner loop overwrites Dst, which avoids adding the current Dst value - - ; 15 iterations - xor r15, r15 ; carry = 0 (for "odd" iterations set only the r15 carry) - mov rbx, [rcx] ; mulword - mov [rdi], r15 ; Write 0 in the first word - - SQR_SINGLEADD_64 1, rsi, rdi, r15, r12 - SQR_SINGLEADD_64 2, rsi, rdi, r12, r15 - SQR_SINGLEADD_64 3, rsi, rdi, r15, r12 - - SQR_SINGLEADD_64 4, rsi, rdi, r12, r15 - SQR_SINGLEADD_64 5, rsi, rdi, r15, r12 - SQR_SINGLEADD_64 6, rsi, rdi, r12, r15 - SQR_SINGLEADD_64 7, rsi, rdi, r15, r12 - - SQR_SINGLEADD_64 8, rsi, rdi, r12, r15 - SQR_SINGLEADD_64 9, rsi, rdi, r15, r12 - SQR_SINGLEADD_64 10, rsi, rdi, r12, r15 - SQR_SINGLEADD_64 11, rsi, rdi, r15, r12 - - SQR_SINGLEADD_64 12, rsi, rdi, r12, r15 - SQR_SINGLEADD_64 13, rsi, rdi, r15, r12 - SQR_SINGLEADD_64 14, rsi, rdi, r12, r15 - SQR_SINGLEADD_64 15, rsi, rdi, r15, r12 - - mov [rdi + 16*8], r12 ; write last word, cannot overflow because Dst is at least 2 digits long - add r10, 8 ; Skip over the first word - - ; 14 iterations (adding the current Dst value) - xor r12, r12 ; carry = 0 (for "even" iterations set only the r12 carry) - SQR_SIZE_SPECIFIC_INIT - SQR_DOUBLEADD_64_2 0 - SQR_DOUBLEADD_64_4 2 - SQR_DOUBLEADD_64_8 6 - mov [rdi + 14*8], r12 - - ; 13 iterations - xor r15, r15 - SQR_SIZE_SPECIFIC_INIT - SQR_DOUBLEADD_64 0, rsi, rdi, r15, r12 ; Notice the dst_carry is r12 since all the "double" macros have r12 as src_carry - SQR_DOUBLEADD_64_4 1 - SQR_DOUBLEADD_64_8 5 - mov [rdi + 13*8], r12 - - ; 12 iterations - xor r12, r12 - SQR_SIZE_SPECIFIC_INIT - SQR_DOUBLEADD_64_4 0 - SQR_DOUBLEADD_64_8 4 - mov [rdi + 12*8], r12 - - ; 11 iterations - xor r15, r15 - SQR_SIZE_SPECIFIC_INIT - SQR_DOUBLEADD_64 0, rsi, rdi, r15, r12 - 
SQR_DOUBLEADD_64_2 1 - SQR_DOUBLEADD_64_8 3 - mov [rdi + 11*8], r12 - - ; 10 iterations - xor r12, r12 - SQR_SIZE_SPECIFIC_INIT - SQR_DOUBLEADD_64_2 0 - SQR_DOUBLEADD_64_8 2 - mov [rdi + 10*8], r12 - - - ; 9 iterations - xor r15, r15 - SQR_SIZE_SPECIFIC_INIT - SQR_DOUBLEADD_64 0, rsi, rdi, r15, r12 - SQR_DOUBLEADD_64_8 1 - mov [rdi + 9*8], r12 - - ; 8 iterations - xor r12, r12 - SQR_SIZE_SPECIFIC_INIT - SQR_DOUBLEADD_64_8 0 - mov [rdi + 8*8], r12 - - ; 7 iterations - xor r15, r15 - SQR_SIZE_SPECIFIC_INIT - SQR_DOUBLEADD_64 0, rsi, rdi, r15, r12 - SQR_DOUBLEADD_64_2 1 - SQR_DOUBLEADD_64_4 3 - mov [rdi + 7*8], r12 - - ; 6 iterations - xor r12, r12 - SQR_SIZE_SPECIFIC_INIT - SQR_DOUBLEADD_64_2 0 - SQR_DOUBLEADD_64_4 2 - mov [rdi + 6*8], r12 - - ; 5 iterations - xor r15, r15 - SQR_SIZE_SPECIFIC_INIT - SQR_DOUBLEADD_64 0, rsi, rdi, r15, r12 - SQR_DOUBLEADD_64_4 1 - mov [rdi + 5*8], r12 - - ; 4 iterations - xor r12, r12 - SQR_SIZE_SPECIFIC_INIT - SQR_DOUBLEADD_64_4 0 - mov [rdi + 4*8], r12 - - ; 3 iterations - xor r15, r15 - SQR_SIZE_SPECIFIC_INIT - SQR_DOUBLEADD_64 0, rsi, rdi, r15, r12 - SQR_DOUBLEADD_64_2 1 - mov [rdi + 3*8], r12 - - ; 2 iterations - xor r12, r12 - SQR_SIZE_SPECIFIC_INIT - SQR_DOUBLEADD_64_2 0 - mov [rdi + 2*8], r12 - - ; 1 iterations - xor r15, r15 - SQR_SIZE_SPECIFIC_INIT - SQR_DOUBLEADD_64 0, rsi, rdi, r15, r12 - mov [rdi + 8], r12 - - xor rdx, rdx - mov [rdi + 16], rdx ; Final word = 0 - - - ; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - ; Second Pass - Shifting all results 1 bit left - ; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - - xor rax, rax ; carry flag = 0 - ; mov r11, r9 ; nDigits - mov rdi, r8 ; pDst pointer - ; shl r11, 1 ; 2*nDigits - - ; ALIGN 16 -; SymCryptFdefRawSquare1024AsmSecondPass: - SQR_SHIFT_LEFT 0 - SQR_SHIFT_LEFT 1 - SQR_SHIFT_LEFT 2 - SQR_SHIFT_LEFT 3 - - SQR_SHIFT_LEFT 4 - SQR_SHIFT_LEFT 5 - SQR_SHIFT_LEFT 6 - SQR_SHIFT_LEFT 7 - - SQR_SHIFT_LEFT 8 - SQR_SHIFT_LEFT 9 - SQR_SHIFT_LEFT 10 - SQR_SHIFT_LEFT 11 - - SQR_SHIFT_LEFT 12 - SQR_SHIFT_LEFT 13 - SQR_SHIFT_LEFT 14 - SQR_SHIFT_LEFT 15 - - SQR_SHIFT_LEFT 16 - SQR_SHIFT_LEFT 17 - SQR_SHIFT_LEFT 18 - SQR_SHIFT_LEFT 19 - - SQR_SHIFT_LEFT 20 - SQR_SHIFT_LEFT 21 - SQR_SHIFT_LEFT 22 - SQR_SHIFT_LEFT 23 - - SQR_SHIFT_LEFT 24 - SQR_SHIFT_LEFT 25 - SQR_SHIFT_LEFT 26 - SQR_SHIFT_LEFT 27 - - SQR_SHIFT_LEFT 28 - SQR_SHIFT_LEFT 29 - SQR_SHIFT_LEFT 30 - SQR_SHIFT_LEFT 31 - - ; lea rdi, [rdi + 64] - ; dec r11 - ; jnz SymCryptFdefRawSquare1024AsmSecondPass - - ; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - ; Third Pass - Adding the squares on the even columns and propagating the sum - ; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - - mov rsi, r14 ; rsi = pSrc - mov rdi, r8 ; rdi = pDst - - xor r12, r12 - - SQR_DIAGONAL_PROP 0 - SQR_DIAGONAL_PROP 1 - SQR_DIAGONAL_PROP 2 - SQR_DIAGONAL_PROP 3 - SQR_DIAGONAL_PROP 4 - SQR_DIAGONAL_PROP 5 - SQR_DIAGONAL_PROP 6 - SQR_DIAGONAL_PROP 7 - - SQR_DIAGONAL_PROP 8 - SQR_DIAGONAL_PROP 9 - SQR_DIAGONAL_PROP 10 - SQR_DIAGONAL_PROP 11 - SQR_DIAGONAL_PROP 12 - SQR_DIAGONAL_PROP 13 - SQR_DIAGONAL_PROP 14 - SQR_DIAGONAL_PROP 15 - - MULT_COMMON_EPILOGUE - - NESTED_END SymCryptFdefRawSquare1024Asm, _TEXT - -;VOID -;SymCryptFdefMontgomeryReduce1024Asm( -; _In_ PCSYMCRYPT_MODULUS pmMod, -; _In_ PUINT32 pSrc, -; _Out_ PUINT32 pDst ) - - NESTED_ENTRY SymCryptFdefMontgomeryReduce1024Asm, _TEXT - - MULT_COMMON_PROLOGUE - - mov r11, rdx ; r11 = pSrc - mov ebp, [rcx + 
SymCryptModulusNdigitsOffsetAmd64] ; nDigits - mov r13, [rcx + SymCryptModulusMontgomeryInv64OffsetAmd64] ; inv64 - - lea rcx, [rcx + SymCryptModulusValueOffsetAmd64] ; modulus value - - mov edi, ebp ; outer loop counter - shl edi, 3 ; edi is in words - - xor r14d, r14d - - ; General register allocations - ; rax = multiply result - ; rbx = multiplier in inner loop - ; rcx = pointer to modulus value - ; rdx = multiply result - ; rsi = loop counter - ; rdi = loop counter - ; rbp = nDigits - ; r8 = pDst - ; r9 = running pointer in Src - ; r10 = running pointer in Mod - ; r11 = pSrc (updated in outer loop) - ; r12 = carry - ; r13 = pmMod->tm.montgomery.inv64 - ; r14 = carry out from last word of previous loop iteration - - ALIGN 16 - -SymCryptFdefMontgomeryReduce1024AsmOuterLoop: - - ; start decoder with a few simple instructions, including at least one that requires - ; a uop execution and is on the critical path - - mov rbx, [r11] ; fetch word of Src we want to set to zero - mov r10, r11 - mov r9, rcx - - imul rbx, r13 ; lower word is same for signed & unsigned multiply - - mov esi, ebp - xor r12d, r12d - - ; rax = mul scratch - ; rbx = multiplier - ; rcx = pointer to modulus value - ; rdx = mul scratch - ; edi = outer loop counter (words) - ; esi = inner loop counter (digits) - ; r9 = running ptr to modulus - ; r10 = running ptr to input/scratch - ; r12 = carry for even words (64 bits) - ; r15 = carry for odd words (64 bits) - - MULT_DOUBLEADD_128 0, r9, r10 - MULT_DOUBLEADD_128 2, r9, r10 - MULT_DOUBLEADD_128 4, r9, r10 - MULT_DOUBLEADD_128 6, r9, r10 - - MULT_DOUBLEADD_128 8, r9, r10 - MULT_DOUBLEADD_128 10, r9, r10 - MULT_DOUBLEADD_128 12, r9, r10 - MULT_DOUBLEADD_128 14, r9, r10 - - lea r10,[r10 + 128] - - add r12, r14 - mov r14d, 0 - adc r14, 0 - add r12, [r10] - adc r14, 0 - mov [r10], r12 - - lea r11,[r11 + 8] - - dec edi - jnz SymCryptFdefMontgomeryReduce1024AsmOuterLoop - - ; - ; Most of the work is done; now all that is left is subtract the modulus if it is smaller than the result - ; - - ; First we compute the pSrc result minus the modulus into the destination - mov esi, ebp ; loop ctr - mov r10, r11 ; pSrc - mov r9, rcx ; pMod - mov r12, r8 ; pDst - - ; Cy = 0 because the last 'sub edi,1' resulted in 0 - - ALIGN 16 - -SymCryptFdefMontgomeryReduce1024AsmSubLoop: - mov rax,[r10] - sbb rax,[r9] - mov [r12], rax - - mov rax,[r10 + 8] - sbb rax,[r9 + 8] - mov [r12 + 8], rax - - mov rax,[r10 + 16] - sbb rax,[r9 + 16] - mov [r12 + 16], rax - - mov rax,[r10 + 24] - sbb rax,[r9 + 24] - mov [r12 + 24], rax - - mov rax,[r10 + 32] - sbb rax,[r9 + 32] - mov [r12 + 32], rax - - mov rax,[r10 + 40] - sbb rax,[r9 + 40] - mov [r12 + 40], rax - - mov rax,[r10 + 48] - sbb rax,[r9 + 48] - mov [r12 + 48], rax - - mov rax,[r10 + 56] - sbb rax,[r9 + 56] - mov [r12 + 56], rax - - lea r10,[r10 + 64] - lea r9,[r9 + 64] - lea r12,[r12 + 64] - - dec esi - jnz SymCryptFdefMontgomeryReduce1024AsmSubLoop - - ; Finally a masked copy form pSrc to pDst - ; copy if: r14 == 0 && Cy = 1 - sbb r14d, 0 - - movd xmm0, r14d ; xmm0[0] = mask - pcmpeqd xmm1, xmm1 ; xmm1 = ff...ff - pshufd xmm0, xmm0, 0 ; xmm0[0..3] = mask - pxor xmm1, xmm0 ; xmm1 = not Mask - - ALIGN 16 - -SymCryptFdefMontgomeryReduce1024AsmMaskedCopyLoop: - movdqa xmm2, [r11] ; xmm2 = pSrc[0] - movdqa xmm3, [r8] ; xmm3 = pDst[0] - pand xmm2, xmm0 ; - pand xmm3, xmm1 ; - por xmm2, xmm3 - movdqa [r8], xmm2 - - movdqa xmm2, [r11 + 16] ; xmm2 = pSrc[0] - movdqa xmm3, [r8 + 16] ; xmm3 = pDst[0] - pand xmm2, xmm0 ; - pand xmm3, xmm1 ; - por xmm2, xmm3 - movdqa 
[r8 + 16], xmm2 - - movdqa xmm2, [r11 + 32] ; xmm2 = pSrc[0] - movdqa xmm3, [r8 + 32] ; xmm3 = pDst[0] - pand xmm2, xmm0 ; - pand xmm3, xmm1 ; - por xmm2, xmm3 - movdqa [r8 + 32], xmm2 - - movdqa xmm2, [r11 + 48] ; xmm2 = pSrc[0] - movdqa xmm3, [r8 + 48] ; xmm3 = pDst[0] - pand xmm2, xmm0 ; - pand xmm3, xmm1 ; - por xmm2, xmm3 - movdqa [r8 + 48], xmm2 - - ; Move on to the next digit - lea r11,[r11 + 64] - lea r8,[r8 + 64] - - dec ebp - jnz SymCryptFdefMontgomeryReduce1024AsmMaskedCopyLoop - - MULT_COMMON_EPILOGUE - - NESTED_END SymCryptFdefMontgomeryReduce1024Asm, _TEXT - - end diff --git a/lib/amd64/fdef_asm.symcryptasm b/lib/amd64/fdef_asm.symcryptasm new file mode 100644 index 0000000..cfe1f53 --- /dev/null +++ b/lib/amd64/fdef_asm.symcryptasm @@ -0,0 +1,2135 @@ +// +// fdef_asm.symcryptasm Assembler code for large integer arithmetic in the default data format +// Expresses asm in a generic enough way to enable generation of MASM and GAS using the +// symcryptasm_processor.py script and C preprocessor +// +// Copyright (c) Microsoft Corporation. Licensed under the MIT license. +// + +#include "symcryptasm_shared.cppasm" + + +MACRO_START(MULT_SINGLEADD_128, index, src_reg, dst_reg, Q0, QH, mul_word, even_carry, odd_carry) + // Q0 = mul scratch + // QH = mul scratch + // mul_word = multiplier + // src_reg = running ptr to input + // dst_reg = running ptr to output/scratch + // even_carry = carry for even words (64 bits) + // odd_carry = carry for odd words (64 bits) + + mov Q0, [src_reg + 8*index] + mul mul_word + mov odd_carry, QH + add Q0, even_carry + mov [dst_reg + 8*index], Q0 + adc odd_carry, 0 + + mov Q0, [src_reg + 8*(index+1)] + mul mul_word + mov even_carry, QH + add Q0, odd_carry + mov [dst_reg + 8*(index+1)], Q0 + adc even_carry, 0 +MACRO_END() + +MACRO_START(MULT_DOUBLEADD_128, index, src_reg, dst_reg, Q0, QH, mul_word, even_carry, odd_carry) + // Q0 = mul scratch + // QH = mul scratch + // mul_word = multiplier + // src_reg = running ptr to input + // dst_reg = running ptr to output/scratch + // even_carry = carry for even words (64 bits) + // odd_carry = carry for odd words (64 bits) + + mov Q0, [src_reg + 8*index] + mul mul_word + mov odd_carry, QH + add Q0, [dst_reg + 8*index] + adc odd_carry, 0 + add Q0, even_carry + mov [dst_reg + 8*index], Q0 + adc odd_carry, 0 + + mov Q0, [src_reg + 8*(index+1)] + mul mul_word + mov even_carry, QH + add Q0, [dst_reg + 8*(index+1)] + adc even_carry, 0 + add Q0, odd_carry + mov [dst_reg + 8*(index+1)], Q0 + adc even_carry, 0 +MACRO_END() + +// Squaring + +MACRO_START(SQR_SINGLEADD_64, index, src_reg, dst_reg, Q0, QH, mul_word, src_carry, dst_carry) + // Q0 = mul scratch + // QH = mul scratch + // mul_word = multiplier + // src_reg = running ptr to input + // dst_reg = running ptr to output/scratch + // src_carry = input carry + // dst_carry = output carry + + mov Q0, [src_reg + 8*index] + mul mul_word + mov dst_carry, QH + add Q0, src_carry + mov [dst_reg + 8*index], Q0 + adc dst_carry, 0 +MACRO_END() + +MACRO_START(SQR_DOUBLEADD_64, index, src_reg, dst_reg, Q0, QH, mul_word, src_carry, dst_carry) + // Q0 = mul scratch + // QH = mul scratch + // mul_word = multiplier + // src_reg = running ptr to input + // dst_reg = running ptr to output/scratch + // src_carry = input carry + // dst_carry = output carry + + mov Q0, [src_reg + 8*index] + mul mul_word + mov dst_carry, QH + add Q0, [dst_reg + 8*index] + adc dst_carry, 0 + add Q0, src_carry + mov [dst_reg + 8*index], Q0 + adc dst_carry, 0 +MACRO_END() + +MACRO_START(SQR_SHIFT_LEFT, index, 
Q0, src_reg) + mov Q0, [src_reg + 8*index] + adc Q0, Q0 // Shift left and add the carry + mov [src_reg + 8*index], Q0 +MACRO_END() + +MACRO_START(SQR_DIAGONAL_PROP, index, src_reg, dst_reg, Q0, QH, carry) + // Calculating the square + mov Q0, [src_reg + 8*index] // mulword + mul Q0 // m^2 + + // Adding the square to the even column + add Q0, [dst_reg + 16*index] + adc QH, 0 + add Q0, carry + adc QH, 0 + mov [dst_reg + 16*index], Q0 + + // Propagating the sum to the next column + mov Q0, QH + xor QH, QH + + add Q0, [dst_reg + 16*index + 8] + adc QH, 0 + mov [dst_reg + 16*index + 8], Q0 + mov carry, QH +MACRO_END() + +MACRO_START(MONTGOMERY14, Q0, QH, mul_word, pA, R0, R1, R2, R3, Cy) + // (xx, R1, R2, R3, QH) = mul_word * (A0..3) + (R0, R1, R2, R3) + // Used when it is statically known that R0 will get set to 0, so we don't bother computing it + // Cy, Q0 = scratch + + mov Q0, [pA] + mul mul_word + add R0, -1 // set carry flag only when R0 is non-zero + adc QH, 0 + mov Cy, QH + + mov Q0, [pA + 8] + mul mul_word + add R1, Q0 + adc QH, 0 + add R1, Cy + adc QH, 0 + mov Cy, QH + + mov Q0, [pA + 16] + mul mul_word + add R2, Q0 + adc QH, 0 + add R2, Cy + adc QH, 0 + mov Cy, QH + + mov Q0, [pA + 24] + mul mul_word + add R3, Q0 + adc QH, 0 + add R3, Cy + adc QH, 0 +MACRO_END() + +MACRO_START(MUL14, Q0, QH, mul_word, pA, R0, R1, R2, R3, Cy) + // (R0, R1, R2, R3, QH) = mul_word * (A0..3) + (R0, R1, R2, R3) + // Cy, Q0 = scratch + + mov Q0, [pA] + mul mul_word + add R0, Q0 + adc QH, 0 + mov Cy, QH + + mov Q0, [pA + 8] + mul mul_word + add R1, Q0 + adc QH, 0 + add R1, Cy + adc QH, 0 + mov Cy, QH + + mov Q0, [pA + 16] + mul mul_word + add R2, Q0 + adc QH, 0 + add R2, Cy + adc QH, 0 + mov Cy, QH + + mov Q0, [pA + 24] + mul mul_word + add R3, Q0 + adc QH, 0 + add R3, Cy + adc QH, 0 +MACRO_END() + +// Macros for size-specific squaring +MACRO_START(SQR_DOUBLEADD_64_2, index, src_reg, dst_reg, Q0, QH, mul_word, src_carry, dst_carry) + SQR_DOUBLEADD_64 (index), src_reg, dst_reg, Q0, QH, mul_word, src_carry, dst_carry + SQR_DOUBLEADD_64 (index + 1), src_reg, dst_reg, Q0, QH, mul_word, dst_carry, src_carry +MACRO_END() + +MACRO_START(SQR_DOUBLEADD_64_4, index, src_reg, dst_reg, Q0, QH, mul_word, src_carry, dst_carry) + SQR_DOUBLEADD_64_2 (index), src_reg, dst_reg, Q0, QH, mul_word, src_carry, dst_carry + SQR_DOUBLEADD_64_2 (index + 2), src_reg, dst_reg, Q0, QH, mul_word, src_carry, dst_carry +MACRO_END() + +MACRO_START(SQR_DOUBLEADD_64_8, index, src_reg, dst_reg, Q0, QH, mul_word, src_carry, dst_carry) + SQR_DOUBLEADD_64_4 (index), src_reg, dst_reg, Q0, QH, mul_word, src_carry, dst_carry + SQR_DOUBLEADD_64_4 (index + 4), src_reg, dst_reg, Q0, QH, mul_word, src_carry, dst_carry +MACRO_END() + +MACRO_START(SQR_SIZE_SPECIFIC_INIT, outer_src_reg, outer_dst_reg, inner_src_reg, inner_dst_reg, mul_word) + lea outer_src_reg, [outer_src_reg + 8] // move outer_src_reg pointer 1 word over + lea outer_dst_reg, [outer_dst_reg + 16] // move outer_dst_reg pointer 2 words over + + mov inner_src_reg, outer_src_reg // inner_src_reg = outer_src_reg + mov inner_dst_reg, outer_dst_reg // inner_dst_reg = outer_dst_reg + + mov mul_word, [outer_src_reg] // Get the next mulword + lea inner_src_reg, [inner_src_reg + 8] // move inner_src_reg pointer 1 word over +MACRO_END() + +//UINT32 +//SYMCRYPT_CALL +//SymCryptFdefRawAdd( +// _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc1, +// _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc2, +// _Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 
pDst, +// UINT32 nDigits ) + +FUNCTION_START(SymCryptFdefRawAddAsm, 4, 5) + + // loop over each half digit + add D4, D4 + xor Q0, Q0 + +SymCryptFdefRawAddAsmLoop: + // carry is in the carry flag + mov Q0,[Q1] + adc Q0,[Q2] + mov [Q3],Q0 + + mov Q0,[Q1 + 8] + adc Q0,[Q2 + 8] + mov [Q3 + 8], Q0 + + mov Q0,[Q1 + 16] + adc Q0,[Q2 + 16] + mov [Q3 + 16], Q0 + + mov Q0,[Q1 + 24] + adc Q0,[Q2 + 24] + mov [Q3 + 24], Q0 + + lea Q1, [Q1 + 32] + lea Q2, [Q2 + 32] + lea Q3, [Q3 + 32] + dec D4 + jnz SymCryptFdefRawAddAsmLoop + + mov Q0, 0 + adc Q0, Q0 + +FUNCTION_END(SymCryptFdefRawAddAsm) + +//UINT32 +//SYMCRYPT_CALL +//SymCryptFdefRawSub( +// _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 Src1, +// _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 Src2, +// _Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 Dst, +// UINT32 nDigits ) + +FUNCTION_START(SymCryptFdefRawSubAsm, 4, 5) + + // loop over each half digit + add D4, D4 + xor Q0, Q0 + +SymCryptFdefRawSubAsmLoop: + // carry is in the carry flag + mov Q0,[Q1] + sbb Q0,[Q2] + mov [Q3],Q0 + + mov Q0,[Q1 + 8] + sbb Q0,[Q2 + 8] + mov [Q3 + 8], Q0 + + mov Q0,[Q1 + 16] + sbb Q0,[Q2 + 16] + mov [Q3 + 16], Q0 + + mov Q0,[Q1 + 24] + sbb Q0,[Q2 + 24] + mov [Q3 + 24], Q0 + + lea Q1,[Q1 + 32] + lea Q2,[Q2 + 32] + lea Q3,[Q3 + 32] + dec D4 + jnz SymCryptFdefRawSubAsmLoop + + mov Q0, 0 + adc Q0, Q0 + +FUNCTION_END(SymCryptFdefRawSubAsm) + +//VOID +//SYMCRYPT_CALL +//SymCryptFdefMaskedCopy( +// _In_reads_bytes_( nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCBYTE pbSrc, +// _InOut_writes_bytes_( nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PBYTE pbDst, +// UINT32 nDigits, +// UINT32 mask ) + +FUNCTION_START(SymCryptFdefMaskedCopyAsm, 4, 4) + + add D3, D3 // loop over half digits + + movd xmm0, D4 // xmm0[0] = mask + pcmpeqd xmm1, xmm1 // xmm1 = ff...ff + pshufd xmm0, xmm0, 0 // xmm0[0..3] = mask + pxor xmm1, xmm0 // xmm1 = not Mask + +SymCryptFdefMaskedCopyAsmLoop: + movdqa xmm2, [Q1] // xmm2 = pSrc[i] + movdqa xmm3, [Q2] // xmm3 = pDst[i] + pand xmm2, xmm0 + pand xmm3, xmm1 + por xmm2, xmm3 + movdqa [Q2], xmm2 + + movdqa xmm2, [Q1 + 16] // xmm2 = pSrc[i + 16] + movdqa xmm3, [Q2 + 16] // xmm3 = pDst[i + 16] + pand xmm2, xmm0 + pand xmm3, xmm1 + por xmm2, xmm3 + movdqa [Q2 + 16], xmm2 + + // Move on to the next digit + + add Q1, 32 + add Q2, 32 + dec D3 + jnz SymCryptFdefMaskedCopyAsmLoop + +FUNCTION_END(SymCryptFdefMaskedCopyAsm) + + +//VOID +//SYMCRYPT_CALL +//SymCryptFdefRawMul( +// _In_reads_(nDigits1 * SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc1, +// UINT32 nDigits1, +// _In_reads_(nDigits2 * SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc2, +// UINT32 nDigits2, +// _Out_writes_((nDigits1+nDigits2)*SYMCRYPT_FDEF_DIGIT_NUINT32) PUINT32 pDst ) + +MUL_FUNCTION_START(SymCryptFdefRawMulAsm, 5, 12) + + shl Q2, 3 // nDigits1 * 8 = # words in Src1 to process + + // Basic structure: + // for each word in Src1: + // Dst += Src2 * word + // Register assignments + // + // Q0 = tmp for mul + // QH = tmp for mul + // Q1 = pSrc1 (updated in outer loop) + // Q2 = # words left from Src1 to process + // Q3 = pSrc2 + // Q4 = nDigits2 + // Q5 = pDst (incremented in outer loop) + // Q6 = inner loop pointer into pSrc2 + // Q7 = inner loop pointer into pDst + // Q8 = word from Src1 to multiply with + // Q9 = carry for even words (64 bits) + // Q10 = inner loop counter + // Q11 = carry for odd words (64 bits) + + + // Outer loop invariant established: Q1, Q3, Q4, Q5 + + mov Q6, Q3 // Q6 = pSrc2 + mov Q7, Q5 // Q7 = pDst + outer loop ctr + mov Q8, [Q1] // mulword 
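As a reference point for the loop structure spelled out in the register-assignment comments above, here is a hypothetical C rendering of the same "for each word in Src1: Dst += Src2 * word" schoolbook multiplication. It is a sketch only (names are illustrative, and it assumes GCC/Clang's `unsigned __int128`), not SymCrypt's C code:

```c
#include <stdint.h>

static void raw_mul_sketch(
    const uint64_t *src1, int nWords1,
    const uint64_t *src2, int nWords2,
    uint64_t *dst )                         /* nWords1 + nWords2 words */
{
    for( int i = 0; i < nWords1; i++ )
    {
        unsigned __int128 c = 0;            /* inter-word carry */
        for( int j = 0; j < nWords2; j++ )
        {
            unsigned __int128 t = (unsigned __int128) src1[i] * src2[j] + c;
            if( i != 0 )                    /* MULT_DOUBLEADD rows add   */
                t += dst[i + j];            /* the current Dst word;     */
            dst[i + j] = (uint64_t) t;      /* the first row (i == 0) is */
            c = t >> 64;                    /* the overwriting SINGLEADD */
        }
        dst[i + nWords2] = (uint64_t) c;    /* carry word, stays in Dst */
    }
}
```

The `i != 0` case mirrors the split into MULT_SINGLEADD_128 for the first pass and MULT_DOUBLEADD_128 for the rest, which is why Dst never has to be zeroed first.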
+ xor Q9, Q9 + mov Q10, Q4 + + // First inner loop overwrites Dst, which avoids adding the current Dst value + +ALIGN(16) + +SymCryptFdefRawMulAsmLoop1: + MULT_SINGLEADD_128 0, Q6, Q7, Q0, QH, Q8, Q9, Q11 + MULT_SINGLEADD_128 2, Q6, Q7, Q0, QH, Q8, Q9, Q11 + MULT_SINGLEADD_128 4, Q6, Q7, Q0, QH, Q8, Q9, Q11 + MULT_SINGLEADD_128 6, Q6, Q7, Q0, QH, Q8, Q9, Q11 + + lea Q6,[Q6 + 64] + lea Q7,[Q7 + 64] + + dec Q10 + jnz SymCryptFdefRawMulAsmLoop1 + + mov [Q7], Q9 // write last word, cannot overflow because Dst is at least 2 digits long + + dec Q2 + +ALIGN(16) + +SymCryptFdefRawMulAsmLoopOuter: + + add Q1, 8 // move to next word of pSrc1 + add Q5, 8 // move Dst pointer one word over + mov Q8, [Q1] + mov Q6, Q3 + mov Q7, Q5 + xor Q9, Q9 + mov Q10, Q4 + +ALIGN(16) + +SymCryptFdefRawMulAsmLoop2: + MULT_DOUBLEADD_128 0, Q6, Q7, Q0, QH, Q8, Q9, Q11 + MULT_DOUBLEADD_128 2, Q6, Q7, Q0, QH, Q8, Q9, Q11 + MULT_DOUBLEADD_128 4, Q6, Q7, Q0, QH, Q8, Q9, Q11 + MULT_DOUBLEADD_128 6, Q6, Q7, Q0, QH, Q8, Q9, Q11 + + lea Q6,[Q6 + 64] + lea Q7,[Q7 + 64] + + dec Q10 + jnz SymCryptFdefRawMulAsmLoop2 + + mov [Q7], Q9 // write next word. (stays within Dst buffer) + + dec Q2 + jnz SymCryptFdefRawMulAsmLoopOuter + +MUL_FUNCTION_END(SymCryptFdefRawMulAsm) + +// VOID +// SYMCRYPT_CALL +// SymCryptFdefRawSquareAsm( +// _In_reads_(nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc, +// UINT32 nDigits, +// _Out_writes_(2*nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PUINT32 pDst ) + +MUL_FUNCTION_START(SymCryptFdefRawSquareAsm, 3, 13) + + // Register assignments + // + // Q0 = tmp for mul + // QH = tmp for mul + // Q1 = outer loop pointer into pSrc + // Q2 = nDigits (constant) + // Q3 = pDst (constant) + // Q4 = inner loop pointer into pSrc + // Q5 = inner loop pointer into pDst + // Q6 = word from Src to multiply with + // Q7 = carry for even words (64 bits) + // Q8 = carry for odd words (64 bits) + // Q9 = outer loop pointer into pDst + // Q10 = outer loop counter of #words left + // Q11 = inner loop counter of #words left + // Q12 = cyclic counter that specifies on which branch we jump into + + //////////////////////////////////////////////////////////////// + // First Pass - Addition of the cross products x_i*x_j with i!=j + //////////////////////////////////////////////////////////////// + // + // At the beginning of each inner loop we will jump over the + // words that don't need processing. The decision of the jump + // will be based on the cyclic counter Q12. + // + // For the first pass we loop over **half** digits since having a smaller + // number of jumps (i.e. 4) is actually faster than having 8 jumps. 
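For orientation, the three passes described in these comments amount to the following. This is a hypothetical C sketch (illustrative only, assuming GCC/Clang's `unsigned __int128`), exploiting x_i*x_j == x_j*x_i: accumulate each cross product once, double everything with a one-bit left shift, then add the diagonal squares on the even columns:

```c
#include <stdint.h>

static void raw_square_sketch( const uint64_t *src, int n, uint64_t *dst )
{
    for( int k = 0; k < 2 * n; k++ ) dst[k] = 0;

    /* Pass 1: accumulate each cross product src[i]*src[j], i < j, once */
    for( int i = 0; i < n; i++ )
    {
        unsigned __int128 c = 0;
        for( int j = i + 1; j < n; j++ )
        {
            unsigned __int128 t = (unsigned __int128) src[i] * src[j]
                                  + dst[i + j] + c;
            dst[i + j] = (uint64_t) t;
            c = t >> 64;
        }
        dst[i + n] = (uint64_t) c;      /* next word is still untouched */
    }

    /* Pass 2: double everything by shifting the result left one bit */
    uint64_t msb = 0;
    for( int k = 0; k < 2 * n; k++ )
    {
        uint64_t w = dst[k];
        dst[k] = (w << 1) | msb;        /* SQR_SHIFT_LEFT: adc w, w */
        msb = w >> 63;
    }

    /* Pass 3: add src[i]^2 on the even columns and propagate the sum */
    unsigned __int128 c = 0;
    for( int i = 0; i < n; i++ )
    {
        unsigned __int128 t = (unsigned __int128) src[i] * src[i]
                              + dst[2 * i] + c;
        dst[2 * i] = (uint64_t) t;
        t = (t >> 64) + dst[2 * i + 1];
        dst[2 * i + 1] = (uint64_t) t;
        c = t >> 64;                    /* SQR_DIAGONAL_PROP carry */
    }
}
```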
+ // + //////////////////////////////////////////////////////////////// + + mov [rsp + GET_MEMSLOT_OFFSET(slot0)], Q1 // save pSrc + + mov Q10, Q2 // nDigits + shl Q10, 3 // Q10 = outer #words + mov Q9, Q3 // Q9 = outer pDst + + mov Q4, Q1 // Q4 = inner pSrc + mov Q5, Q3 // Q5 = inner pDst + + // Initial inner loop overwrites Dst, which avoids adding the current Dst value + + mov Q6, [Q1] // mulword + + xor Q7, Q7 // carry = 0 + xor Q8, Q8 // carry = 0 + + mov Q11, Q10 // Q11 = inner #words + mov [Q5], Q7 // Write 0 in the first word + + // Skip over the first word + jmp SymCryptFdefRawSquareAsmInnerLoopInit_Word1 + +ALIGN(16) +SymCryptFdefRawSquareAsmInnerLoopInit_Word0: + SQR_SINGLEADD_64 0, Q4, Q5, Q0, QH, Q6, Q7, Q8 + +ALIGN(16) +SymCryptFdefRawSquareAsmInnerLoopInit_Word1: + SQR_SINGLEADD_64 1, Q4, Q5, Q0, QH, Q6, Q8, Q7 + + SQR_SINGLEADD_64 2, Q4, Q5, Q0, QH, Q6, Q7, Q8 + + SQR_SINGLEADD_64 3, Q4, Q5, Q0, QH, Q6, Q8, Q7 + + lea Q4, [Q4 + 32] + lea Q5, [Q5 + 32] + sub Q11, 4 + jnz SymCryptFdefRawSquareAsmInnerLoopInit_Word0 + + mov [Q5], Q7 // write last word, cannot overflow because Dst is at least 2 digits long + + dec Q10 // Counter for the outer loop + mov Q12, 1 // Cyclic counter Q12 = 1 + +ALIGN(16) +SymCryptFdefRawSquareAsmLoopOuter: + + add Q9, 8 // move Dst pointer 1 word over + + mov Q4, Q1 // Q4 = inner pSrc + mov Q5, Q9 // Q5 = inner pDst + + mov Q6, [Q1 + 8*Q12] // Get the next mulword + + inc B12 // Increment the cyclic counter by 1 + + mov Q11, Q10 // # of words for the inner loop + add Q11, 2 + and Q11, -4 // Zero out the 2 lower bits + + xor Q7, Q7 // carry = 0 + xor Q8, Q8 // carry = 0 + + // Logic to find the correct jump + cmp B12, 3 + je SymCryptFdefRawSquareAsmInnerLoop_Word3 + cmp B12, 2 + je SymCryptFdefRawSquareAsmInnerLoop_Word2 + cmp B12, 1 + je SymCryptFdefRawSquareAsmInnerLoop_Word1 + + // The following instructions are only executed when B12 == 4 + xor B12, B12 // Set it to 0 for the next iteration + + add Q1, 32 // move pSrc 4 words over + add Q9, 32 // move destination 4 words over + + mov Q4, Q1 // Q4 = inner pSrc + mov Q5, Q9 // Q5 = inner pDst + +ALIGN(16) +SymCryptFdefRawSquareAsmInnerLoop_Word0: + SQR_DOUBLEADD_64 0, Q4, Q5, Q0, QH, Q6, Q7, Q8 + +ALIGN(16) +SymCryptFdefRawSquareAsmInnerLoop_Word1: + SQR_DOUBLEADD_64 1, Q4, Q5, Q0, QH, Q6, Q8, Q7 + +ALIGN(16) +SymCryptFdefRawSquareAsmInnerLoop_Word2: + SQR_DOUBLEADD_64 2, Q4, Q5, Q0, QH, Q6, Q7, Q8 + +ALIGN(16) +SymCryptFdefRawSquareAsmInnerLoop_Word3: + SQR_DOUBLEADD_64 3, Q4, Q5, Q0, QH, Q6, Q8, Q7 + + lea Q4, [Q4 + 32] + lea Q5, [Q5 + 32] + sub Q11, 4 + jnz SymCryptFdefRawSquareAsmInnerLoop_Word0 + + mov [Q5], Q7 // write next word. 
(stays within Dst buffer) + + dec Q10 + cmp Q10, 1 + jne SymCryptFdefRawSquareAsmLoopOuter + + xor QH, QH + mov [Q9 + 40], QH // Final word = 0 + + + //////////////////////////////////////////////////////////////// + // Second Pass - Shifting all results 1 bit left + //////////////////////////////////////////////////////////////// + + mov Q10, Q2 // nDigits + mov Q5, Q3 // pDst pointer + shl Q10, 1 // 2*nDigits + +ALIGN(16) +SymCryptFdefRawSquareAsmSecondPass: + SQR_SHIFT_LEFT 0, Q0, Q5 + SQR_SHIFT_LEFT 1, Q0, Q5 + SQR_SHIFT_LEFT 2, Q0, Q5 + SQR_SHIFT_LEFT 3, Q0, Q5 + + SQR_SHIFT_LEFT 4, Q0, Q5 + SQR_SHIFT_LEFT 5, Q0, Q5 + SQR_SHIFT_LEFT 6, Q0, Q5 + SQR_SHIFT_LEFT 7, Q0, Q5 + + lea Q5, [Q5 + 64] + dec Q10 + jnz SymCryptFdefRawSquareAsmSecondPass + + ////////////////////////////////////////////////////////////////////////////// + // Third Pass - Adding the squares on the even columns and propagating the sum + ////////////////////////////////////////////////////////////////////////////// + + mov Q1, [rsp + GET_MEMSLOT_OFFSET(slot0)] // Q1 = pSrc + +SymCryptFdefRawSquareAsmThirdPass: + SQR_DIAGONAL_PROP 0, Q1, Q3, Q0, QH, Q10 + SQR_DIAGONAL_PROP 1, Q1, Q3, Q0, QH, Q10 + SQR_DIAGONAL_PROP 2, Q1, Q3, Q0, QH, Q10 + SQR_DIAGONAL_PROP 3, Q1, Q3, Q0, QH, Q10 + SQR_DIAGONAL_PROP 4, Q1, Q3, Q0, QH, Q10 + SQR_DIAGONAL_PROP 5, Q1, Q3, Q0, QH, Q10 + SQR_DIAGONAL_PROP 6, Q1, Q3, Q0, QH, Q10 + SQR_DIAGONAL_PROP 7, Q1, Q3, Q0, QH, Q10 + + add Q1, 64 // One digit up + add Q3, 128 // Two digits up + dec Q2 + jnz SymCryptFdefRawSquareAsmThirdPass + +MUL_FUNCTION_END(SymCryptFdefRawSquareAsm) + +//VOID +//SymCryptFdefMontgomeryReduceAsm( +// _In_ PCSYMCRYPT_MODULUS pmMod, +// _In_ PUINT32 pSrc, +// _Out_ PUINT32 pDst ) + +MUL_FUNCTION_START(SymCryptFdefMontgomeryReduceAsm, 3, 14) + + mov D4, [Q1 + SymCryptModulusNdigitsOffsetAmd64] // nDigits + mov Q5, [Q1 + SymCryptModulusMontgomeryInv64OffsetAmd64] // inv64 + + lea Q1, [Q1 + SymCryptModulusValueOffsetAmd64] // modulus value + + mov D13, D4 // outer loop counter + shl D13, 3 // D13 is in words + + xor D9, D9 + + // General register allocations + // Q0 = multiply result + // QH = multiply result + // Q1 = pointer to modulus value + // Q2 = pSrc (updated in outer loop) + // Q3 = pDst + // D4 = nDigits + // Q5 = pmMod->tm.montgomery.inv64 + // Q6 = multiplier in inner loop + // Q7 = carry for even words (64 bits) + // Q8 = carry for odd words (64 bits) + // Q9 = carry out from last word of previous loop iteration + // Q10 = running pointer in Src + // Q11 = running pointer in Mod + // Q12 = loop counter + // Q13 = loop counter + +ALIGN(16) + +SymCryptFdefMontgomeryReduceAsmOuterLoop: + + // start decoder with a few simple instructions, including at least one that requires + // a uop execution and is on the critical path + + mov Q6, [Q2] // fetch word of Src we want to set to zero + mov Q11, Q2 + mov Q10, Q1 + + imul Q6, Q5 // lower word is same for signed & unsigned multiply + + mov D12, D4 + xor D7, D7 + +ALIGN(16) + +SymCryptFdefMontgomeryReduceAsmInnerloop: + // Q0 = mul scratch + // QH = mul scratch + // Q6 = multiplier + // Q1 = pointer to modulus value + // D13 = outer loop counter (words) + // D12 = inner loop counter (digits) + // Q10 = running ptr to modulus + // Q11 = running ptr to input/scratch + // Q7 = carry for even words (64 bits) + // Q8 = carry for odd words (64 bits) + + MULT_DOUBLEADD_128 0, Q10, Q11, Q0, QH, Q6, Q7, Q8 + MULT_DOUBLEADD_128 2, Q10, Q11, Q0, QH, Q6, Q7, Q8 + MULT_DOUBLEADD_128 4, Q10, Q11, Q0, QH, Q6, Q7, Q8 + 
MULT_DOUBLEADD_128 6, Q10, Q11, Q0, QH, Q6, Q7, Q8 + + lea Q10,[Q10 + 64] + lea Q11,[Q11 + 64] + + dec D12 + jnz SymCryptFdefMontgomeryReduceAsmInnerloop + + add Q7, Q9 + mov D9, 0 + adc Q9, 0 + add Q7, [Q11] + adc Q9, 0 + mov [Q11], Q7 + + lea Q2,[Q2 + 8] + + dec D13 + jnz SymCryptFdefMontgomeryReduceAsmOuterLoop + + // + // Most of the work is done - now all that is left is subtract the modulus if it is smaller than the result + // + + // First we compute the pSrc result minus the modulus into the destination + mov D12, D4 // loop ctr + mov Q11, Q2 // pSrc + mov Q10, Q1 // pMod + mov Q7, Q3 // pDst + + // Cy = 0 because the last 'adc Q9,0' resulted in 0, 1, or 2 + +ALIGN(16) + +SymCryptFdefMontgomeryReduceAsmSubLoop: + mov Q0,[Q11] + sbb Q0,[Q10] + mov [Q7], Q0 + + mov Q0,[Q11 + 8] + sbb Q0,[Q10 + 8] + mov [Q7 + 8], Q0 + + mov Q0,[Q11 + 16] + sbb Q0,[Q10 + 16] + mov [Q7 + 16], Q0 + + mov Q0,[Q11 + 24] + sbb Q0,[Q10 + 24] + mov [Q7 + 24], Q0 + + mov Q0,[Q11 + 32] + sbb Q0,[Q10 + 32] + mov [Q7 + 32], Q0 + + mov Q0,[Q11 + 40] + sbb Q0,[Q10 + 40] + mov [Q7 + 40], Q0 + + mov Q0,[Q11 + 48] + sbb Q0,[Q10 + 48] + mov [Q7 + 48], Q0 + + mov Q0,[Q11 + 56] + sbb Q0,[Q10 + 56] + mov [Q7 + 56], Q0 + + lea Q11,[Q11 + 64] + lea Q10,[Q10 + 64] + lea Q7,[Q7 + 64] + + dec D12 + jnz SymCryptFdefMontgomeryReduceAsmSubLoop + + // Finally a masked copy form pSrc to pDst + // copy if: Q9 == 0 && Cy = 1 + sbb D9, 0 + + movd xmm0, D9 // xmm0[0] = mask + pcmpeqd xmm1, xmm1 // xmm1 = ff...ff + pshufd xmm0, xmm0, 0 // xmm0[0..3] = mask + pxor xmm1, xmm0 // xmm1 = not Mask + +ALIGN(16) + +SymCryptFdefMontgomeryReduceAsmMaskedCopyLoop: + movdqa xmm2, [Q2] // xmm2 = pSrc[0] + movdqa xmm3, [Q3] // xmm3 = pDst[0] + pand xmm2, xmm0 // + pand xmm3, xmm1 // + por xmm2, xmm3 + movdqa [Q3], xmm2 + + movdqa xmm2, [Q2 + 16] // xmm2 = pSrc[0] + movdqa xmm3, [Q3 + 16] // xmm3 = pDst[0] + pand xmm2, xmm0 // + pand xmm3, xmm1 // + por xmm2, xmm3 + movdqa [Q3 + 16], xmm2 + + movdqa xmm2, [Q2 + 32] // xmm2 = pSrc[0] + movdqa xmm3, [Q3 + 32] // xmm3 = pDst[0] + pand xmm2, xmm0 // + pand xmm3, xmm1 // + por xmm2, xmm3 + movdqa [Q3 + 32], xmm2 + + movdqa xmm2, [Q2 + 48] // xmm2 = pSrc[0] + movdqa xmm3, [Q3 + 48] // xmm3 = pDst[0] + pand xmm2, xmm0 // + pand xmm3, xmm1 // + por xmm2, xmm3 + movdqa [Q3 + 48], xmm2 + + // Move on to the next digit + lea Q2,[Q2 + 64] + lea Q3,[Q3 + 64] + + dec D4 + jnz SymCryptFdefMontgomeryReduceAsmMaskedCopyLoop + +MUL_FUNCTION_END(SymCryptFdefMontgomeryReduceAsm) + + +// -------------------------------- +// 256-bit size specific functions +// -------------------------------- + +//VOID +//SYMCRYPT_CALL +//SymCryptFdefModAdd256( +// _In_ PCSYMCRYPT_MODULUS pmMod, +// _In_ PCSYMCRYPT_MODELEMENT peSrc1, +// _In_ PCSYMCRYPT_MODELEMENT peSrc2, +// _Out_ PSYMCRYPT_MODELEMENT peDst, +// _Out_writes_bytes_( cbScratch ) PBYTE pbScratch, +// SIZE_T cbScratch ) + +FUNCTION_START(SymCryptFdefModAdd256Asm, 4, 11) + + // Q1 = pmMod + // Q2 = peSrc1 + // Q3 = peSrc2 + // Q4 = peDst + + // compute Src1 + Src2 into (Q0, Q5, Q6, Q7) with carry out mask in Q8 + + mov Q0, [Q2] + add Q0, [Q3 ] + mov Q5, [Q2 + 8] + adc Q5, [Q3 + 8] + mov Q6, [Q2 + 16] + adc Q6, [Q3 + 16] + mov Q7, [Q2 + 24] + adc Q7, [Q3 + 24] + sbb Q8, Q8 // Q8 = carry out mask + + // Q2, Q3: free + // Compute sum - Mod into (Q2, Q3, Q9, Q10) = sum - modulus, Q1 = carry out mask + + add Q1, SymCryptModulusValueOffsetAmd64 + + mov Q2, Q0 + sub Q2, [Q1] + mov Q3, Q5 + sbb Q3, [Q1 + 8] + mov Q9, Q6 + sbb Q9, [Q1 + 16] + mov Q10, Q7 + sbb Q10, [Q1 + 24] + + sbb 
Q1, Q1 // Q1 = carry out mask + + // Choose between the two + // addition carry = 1, then subtraction carry = 1 and we pick the 2nd result. + // addition carry = 0 and subtraction carry = 0: pick 2nd result + // addition carry = 0 and subtraction carry = 1: pick first result + + xor Q1, Q8 // 0 = 2nd result, 1 = first result + + xor Q0, Q2 + xor Q5, Q3 + xor Q6, Q9 + xor Q7, Q10 + + and Q0, Q1 + and Q5, Q1 + and Q6, Q1 + and Q7, Q1 + + xor Q2, Q0 + xor Q3, Q5 + xor Q9, Q6 + xor Q10, Q7 + + mov [Q4 + 0], Q2 + mov [Q4 + 8], Q3 + mov [Q4 + 16], Q9 + mov [Q4 + 24], Q10 + +FUNCTION_END(SymCryptFdefModAdd256Asm) + + +FUNCTION_START(SymCryptFdefModSub256Asm, 4, 10) + + // Q1 = pmMod + // Q2 = peSrc1 + // Q3 = peSrc2 + // Q4 = peDst + + // compute Src1 - Src2 into (Q0, Q5, Q6, Q7) with carry out mask in Q8 + + mov Q0, [Q2] + sub Q0, [Q3] + mov Q5, [Q2 + 8] + sbb Q5, [Q3 + 8] + mov Q6, [Q2 + 16] + sbb Q6, [Q3 + 16] + mov Q7, [Q2 + 24] + sbb Q7, [Q3 + 24] + sbb Q8, Q8 // Q8 = carry out mask + + // Q2, Q3: free + // Load Mod into (Q2, Q3, Q9, Q1) + + add Q1, SymCryptModulusValueOffsetAmd64 + + mov Q2, [Q1] + mov Q3, [Q1 + 8] + mov Q9, [Q1 + 16] + mov Q1, [Q1 + 24] + + // Mask the value to be added to zero if there was no underflow + and Q2, Q8 + and Q3, Q8 + and Q9, Q8 + and Q1, Q8 + + // Add the (masked) modulus + add Q0, Q2 + adc Q5, Q3 + adc Q6, Q9 + adc Q7, Q1 + + mov [Q4 + 0], Q0 + mov [Q4 + 8], Q5 + mov [Q4 + 16], Q6 + mov [Q4 + 24], Q7 + +FUNCTION_END(SymCryptFdefModSub256Asm) + +//================================================= +// Multiplication +// + +#if defined(SYMCRYPT_MASM) +altentry SymCryptFdefMontgomeryReduce256AsmInternal +#endif + +//VOID +//SYMCRYPT_CALL +//SymCryptFdefModMulMontgomery256Asm( +// _In_ PCSYMCRYPT_MODULUS pMod, +// _In_ PCSYMCRYPT_MODELEMENT pSrc1, +// _In_ PCSYMCRYPT_MODELEMENT pSrc2, +// _Out_ PSYMCRYPT_MODELEMENT pDst, +// _Out_writes_bytes_( cbScratch ) PBYTE pbScratch, +// SIZE_T cbScratch ) + +// Note we specify only 4 arguments as we never use arguments 5 and 6 (saves some prolog code in MSFT calling convention) +MUL_FUNCTION_START(SymCryptFdefModMulMontgomery256Asm, 4, 14) + + // Q1 = pMod + // Q2 = pSrc1 + // Q3 = pSrc2 + // Q4 = pDst + + // First we compute the product. 
The result will be in 8 registers + // Q6, Q7, Q8, Q9, Q10, Q11, Q12, Q13 + + mov Q5, [Q2] + xor Q8, Q8 + xor Q9, Q9 + xor Q10, Q10 + + mov Q0, [Q3] + mul Q5 + mov Q6, Q0 + mov Q7, QH + + mov Q0, [Q3 + 8] + mul Q5 + add Q7, Q0 + adc Q8, QH + + mov Q0, [Q3 + 16] + mul Q5 + add Q8, Q0 + adc Q9, QH + + mov Q0, [Q3 + 24] + mul Q5 + add Q9, Q0 + adc Q10, QH + + // Second row + mov Q5, [Q2 + 8] + MUL14 Q0, QH, Q5, Q3, Q7, Q8, Q9, Q10, Q13 + mov Q11, QH + + // third row + mov Q5, [Q2 + 16] + MUL14 Q0, QH, Q5, Q3, Q8, Q9, Q10, Q11, Q13 + mov Q12, QH + + // fourth row + mov Q5, [Q2 + 24] + MUL14 Q0, QH, Q5, Q3, Q9, Q10, Q11, Q12, Q13 + mov Q13, QH + +ALTERNATE_ENTRY(SymCryptFdefMontgomeryReduce256AsmInternal) + // Invariant: + // common prologue used + // 512-bit result in (Q6, Q7, Q8, Q9, Q10, Q11, Q12, Q13) + // Q1 = pmMod + // Q4 = pDst + + mov Q3, [Q1 + SymCryptModulusMontgomeryInv64OffsetAmd64] // inv64 + add Q1, SymCryptModulusValueOffsetAmd64 + + mov Q5, Q6 + imul Q5, Q3 // lower word is the same for signed & unsigned multiply - Q5 = multiplicand for first row + MONTGOMERY14 Q0, QH, Q5, Q1, Q6, Q7, Q8, Q9, Q6 + mov Q6, QH // Save the out carries in (eventually) (Q6, Q7, Q8, Q9) + + mov Q5, Q7 + imul Q5, Q3 + MONTGOMERY14 Q0, QH, Q5, Q1, Q7, Q8, Q9, Q10, Q7 + mov Q7, QH // Save the out carries in (eventually) (Q6, Q7, Q8, Q9) + + mov Q5, Q8 + imul Q5, Q3 + MONTGOMERY14 Q0, QH, Q5, Q1, Q8, Q9, Q10, Q11, Q8 + mov Q8, QH + + mov Q5, Q9 + imul Q5, Q3 + MONTGOMERY14 Q0, QH, Q5, Q1, Q9, Q10, Q11, Q12, Q9 + // mov Q9, QH + + add Q10, Q6 + adc Q11, Q7 + adc Q12, Q8 + adc Q13, QH + + sbb Q5, Q5 // Carry out from final addition in mask form + + // reduced value in (Q10, Q11, Q12, Q13, -Q5), and it is less than 2*Modulus + + mov Q6, Q10 + sub Q6, [Q1] + mov Q7, Q11 + sbb Q7, [Q1 + 8] + mov Q8, Q12 + sbb Q8, [Q1 + 16] + mov Q9, Q13 + sbb Q9, [Q1 + 24] + + sbb Q1, Q1 // Q1 = carry out mask + + // Choose between the two + // addition carry = 1, then subtraction carry = 1 and we pick the 2nd result. + // addition carry = 0 and subtraction carry = 0: pick 2nd result + // addition carry = 0 and subtraction carry = 1: pick first result + + xor Q1, Q5 // 0 = 2nd result, 1 = first result + + xor Q10, Q6 + xor Q11, Q7 + xor Q12, Q8 + xor Q13, Q9 + + and Q10, Q1 + and Q11, Q1 + and Q12, Q1 + and Q13, Q1 + + xor Q6, Q10 + xor Q7, Q11 + xor Q8, Q12 + xor Q9, Q13 + + mov [Q4 + 0], Q6 + mov [Q4 + 8], Q7 + mov [Q4 + 16], Q8 + mov [Q4 + 24], Q9 + +MUL_FUNCTION_END(SymCryptFdefModMulMontgomery256Asm) + + +//VOID +//SYMCRYPT_CALL +//SymCryptFdefMontgomeryReduce256Asm( +// _In_ PCSYMCRYPT_MODULUS pmMod, +// _In_ PUINT32 pSrc, +// _Out_ PUINT32 pDst ) + +// Note we specify 4 arguments so that our prolog matches SymCryptFdefModMulMontgomery256Asm +MUL_FUNCTION_START(SymCryptFdefMontgomeryReduce256Asm, 4, 14) + + mov Q4, Q3 + mov Q6, [Q2 + 0] + mov Q7, [Q2 + 8] + mov Q8, [Q2 + 16] + mov Q9, [Q2 + 24] + mov Q10, [Q2 + 32] + mov Q11, [Q2 + 40] + mov Q12, [Q2 + 48] + mov Q13, [Q2 + 56] + + // Normal code doesn't jump from the body of one function to the body of another function. + // Here we have ensured that our stack frames are identical, so it is safe. + // We just have to convince the other system components that this works... 
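The invariant at SymCryptFdefMontgomeryReduce256AsmInternal — a double-width value reduced one word at a time, then conditionally corrected — is the same one the generic SymCryptFdefMontgomeryReduceAsm above maintains. A hypothetical C sketch of that word-by-word reduction (illustrative only, not SymCrypt source, assuming GCC/Clang's `unsigned __int128`):

```c
#include <stdint.h>

static void montgomery_reduce_sketch(
    uint64_t *src,          /* 2*n words; the low n words get cleared  */
    const uint64_t *mod,    /* n-word modulus                          */
    uint64_t inv64,         /* -1 / mod[0] mod 2^64                    */
    uint64_t *dst,          /* n-word result                           */
    int n )
{
    uint64_t hiCarry = 0;   /* carry out of the top word (Q9 above)    */

    for( int i = 0; i < n; i++ )
    {
        /* low 64 bits only; same for signed and unsigned multiply */
        uint64_t m = src[i] * inv64;        /* makes word i zero    */

        unsigned __int128 c = 0;
        for( int j = 0; j < n; j++ )
        {
            unsigned __int128 t = (unsigned __int128) m * mod[j] + src[i + j] + c;
            src[i + j] = (uint64_t) t;
            c = t >> 64;
        }
        /* fold the loop carry and the previous overflow into the next word */
        unsigned __int128 t = (unsigned __int128) src[i + n] + (uint64_t) c + hiCarry;
        src[i + n] = (uint64_t) t;
        hiCarry = (uint64_t)(t >> 64);
    }

    /* Value is now hiCarry:src[n..2n-1]; subtract the modulus ... */
    uint64_t borrow = 0;
    for( int j = 0; j < n; j++ )
    {
        unsigned __int128 t = (unsigned __int128) src[n + j] - mod[j] - borrow;
        dst[j] = (uint64_t) t;
        borrow = (uint64_t)(t >> 64) & 1;
    }
    /* ... and keep the unsubtracted value if that underflowed. The asm
       does this with a masked copy ("copy if: Q9 == 0 && Cy = 1") so the
       choice is constant-time; an if() is used here only for clarity.   */
    if( hiCarry == 0 && borrow )
        for( int j = 0; j < n; j++ ) dst[j] = src[n + j];
}
```

The final `hiCarry == 0 && borrow` test is the C analogue of the masked-copy loop that closes SymCryptFdefMontgomeryReduceAsm above.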
+ + // Use conditional jump so that stack unwinder doesn't think it is an epilogue + test rsp,rsp + jne SymCryptFdefMontgomeryReduce256AsmInternal // jumps always + + int 3 // Dummy instruction because the debugger seems to have an off-by-one + // error and still see the (wrong) epilogue when on the JNE instruction + // Best guess: the debugger starts the stack trace *after* the current instruction + +MUL_FUNCTION_END(SymCryptFdefMontgomeryReduce256Asm) + + +//VOID +//SYMCRYPT_CALL +//SymCryptFdefModSquareMontgomery256( +// _In_ PCSYMCRYPT_MODULUS pmMod, +// _In_ PCSYMCRYPT_MODELEMENT peSrc, +// _Out_ PSYMCRYPT_MODELEMENT peDst, +// _Out_writes_bytes_( cbScratch ) PBYTE pbScratch, +// SIZE_T cbScratch ) + +// Note we specify 4 arguments so that our prolog matches SymCryptFdefModMulMontgomery256Asm +MUL_FUNCTION_START(SymCryptFdefModSquareMontgomery256Asm, 4, 14) + + // Result in Q6, Q7, Q8, Q9, Q10, Q11, Q12, Q13 + + // Q1 = pmMod + // Q2 = peSrc + // Q3 = peDst + + mov Q4, Q3 + mov Q5, [Q2] + xor Q9, Q9 + xor Q10, Q10 + xor Q11, Q11 + xor Q12, Q12 + + // First we compute all the terms that need doubling + + mov Q0, [Q2 + 8] + mul Q5 + mov Q7, Q0 + mov Q8, QH + + mov Q0, [Q2 + 16] + mul Q5 + add Q8, Q0 + adc Q9, QH + + mov Q0, [Q2 + 24] + mul Q5 + add Q9, Q0 + adc Q10, QH + + mov Q5, [Q2 + 8] + mov Q0, [Q2 + 16] + mul Q5 + add Q9, Q0 + adc QH, 0 + mov Q13, QH + + mov Q0, [Q2 + 24] + mul Q5 + add Q10, Q0 + adc QH, 0 + add Q10, Q13 + adc Q11, QH + + mov Q5, [Q2 + 16] + mov Q0, [Q2 + 24] + mul Q5 + add Q11, Q0 + adc Q12, QH // no overflow from this + + // double these terms + xor Q13, Q13 + + add Q7, Q7 + adc Q8, Q8 + adc Q9, Q9 + adc Q10, Q10 + adc Q11, Q11 + adc Q12, Q12 + adc Q13, 0 + + mov Q0, [Q2] + mul Q0 + mov Q6, Q0 + mov Q5, QH + + mov Q0, [Q2 + 8] + mul Q0 + + add Q7, Q5 + adc Q8, Q0 + adc Q9, QH + sbb Q3, Q3 // -carry + + mov Q0, [Q2 + 16] + mul Q0 + + add Q3, Q3 + adc Q10, Q0 + adc Q11, QH + sbb Q3, Q3 + + mov Q0, [Q2 + 24] + mul Q0 + add Q3, Q3 + adc Q12, Q0 + adc Q13, QH + + // See SymCryptFdefMontgomeryReduce256Asm for a discussion of this strange epilogue sequence + test rsp,rsp + jne SymCryptFdefMontgomeryReduce256AsmInternal // jumps always + + int 3 + +MUL_FUNCTION_END(SymCryptFdefModSquareMontgomery256Asm) + +// -------------------------------- +// 512-bit size specific functions +// -------------------------------- + +//VOID +//SYMCRYPT_CALL +//SymCryptFdefRawMul512Asm( +// _In_reads_(nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc1, +// _In_reads_(nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc2, +// UINT32 nDigits, +// _Out_writes_(2*nWords) PUINT32 pDst ) +MUL_FUNCTION_START(SymCryptFdefRawMul512Asm, 4, 8) + + // Basic structure: + // for each word in Src1: + // Dst += Src2 * word + // Register assignments + // + // Q0 = tmp for mul + // QH = tmp for mul + // Q1 = pSrc1 (updated in outer loop) + // Q2 = pSrc2 (constant) + // Q3 = # words left from Src1 to process + // Q4 = pDst (incremented in outer loop) + // Q5 = word from Src1 to multiply with + // Q6 = carry for even words (64 bits) + // Q7 = carry for odd words (64 bits) + + shl Q3, 3 // nDigits * 8 = # words in Src1 to process + + mov Q5, [Q1] // mulword + xor Q6, Q6 // carry + + // First inner loop overwrites Dst, which avoids adding the current Dst value + MULT_SINGLEADD_128 0, Q2, Q4, Q0, QH, Q5, Q6, Q7 + MULT_SINGLEADD_128 2, Q2, Q4, Q0, QH, Q5, Q6, Q7 + MULT_SINGLEADD_128 4, Q2, Q4, Q0, QH, Q5, Q6, Q7 + MULT_SINGLEADD_128 6, Q2, Q4, Q0, QH, Q5, Q6, Q7 + + mov [Q4 + 64], Q6 // write last word, cannot 
overflow because Dst is at least 2 digits long + + dec Q3 + +ALIGN(16) + +SymCryptFdefRawMul512AsmLoopOuter: + + lea Q1, [Q1 + 8] // move to next word of pSrc1 + lea Q4, [Q4 + 8] // move Dst pointer one word over + + mov Q5, [Q1] // mulword + xor Q6, Q6 // carry + + MULT_DOUBLEADD_128 0, Q2, Q4, Q0, QH, Q5, Q6, Q7 + MULT_DOUBLEADD_128 2, Q2, Q4, Q0, QH, Q5, Q6, Q7 + MULT_DOUBLEADD_128 4, Q2, Q4, Q0, QH, Q5, Q6, Q7 + MULT_DOUBLEADD_128 6, Q2, Q4, Q0, QH, Q5, Q6, Q7 + + mov [Q4 + 64], Q6 // write last word, cannot overflow because Dst is at least 2 digits long + + dec Q3 + jnz SymCryptFdefRawMul512AsmLoopOuter + +MUL_FUNCTION_END(SymCryptFdefRawMul512Asm) + +// VOID +// SYMCRYPT_CALL +// SymCryptFdefRawSquare512Asm( +// _In_reads_(nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc, +// UINT32 nDigits, +// _Out_writes_(2*nWords) PUINT32 pDst ) +MUL_FUNCTION_START(SymCryptFdefRawSquare512Asm, 3, 11) + + // Register assignments + // + // Q0 = tmp for mul + // QH = tmp for mul + // Q1 = outer loop pointer into pSrc + // Q2 = nDigits (constant) + // Q3 = pDst (constant) + // Q4 = inner loop pointer into pSrc + // Q5 = inner loop pointer into pDst + // Q6 = word from Src to multiply with + // Q7 = carry for even words (64 bits) + // Q8 = carry for odd words (64 bits) + // Q9 = outer loop pointer into pDst + // Q10 = outer loop counter of #words left + + mov [rsp + GET_MEMSLOT_OFFSET(slot0)], Q1 // save pSrc + + //////////////////////////////////////////////////////////////// + // First Pass - Addition of the cross products x_i*x_j with i!=j + //////////////////////////////////////////////////////////////// + + mov Q9, Q3 // Q9 = outer pDst + + mov Q4, Q1 // Q4 = inner pSrc + mov Q5, Q3 // Q5 = inner pDst + + // Initial inner loop overwrites Dst, which avoids adding the current Dst value + // 7 iterations + xor Q8, Q8 // carry = 0 (for "odd" iterations set only the Q8 carry) + mov Q6, [Q1] // mulword + mov [Q5], Q8 // Write 0 in the first word + + SQR_SINGLEADD_64 1, Q4, Q5, Q0, QH, Q6, Q8, Q7 + SQR_SINGLEADD_64 2, Q4, Q5, Q0, QH, Q6, Q7, Q8 + SQR_SINGLEADD_64 3, Q4, Q5, Q0, QH, Q6, Q8, Q7 + + SQR_SINGLEADD_64 4, Q4, Q5, Q0, QH, Q6, Q7, Q8 + SQR_SINGLEADD_64 5, Q4, Q5, Q0, QH, Q6, Q8, Q7 + SQR_SINGLEADD_64 6, Q4, Q5, Q0, QH, Q6, Q7, Q8 + SQR_SINGLEADD_64 7, Q4, Q5, Q0, QH, Q6, Q8, Q7 + + mov [Q5 + 8*8], Q7 // write last word, cannot overflow because Dst is at least 2 digits long + add Q9, 8 // Skip over the first word + + // 6 iterations + xor Q7, Q7 // carry = 0 (for "even" iterations set only the Q7 carry) + SQR_SIZE_SPECIFIC_INIT Q1, Q9, Q4, Q5, Q6 + SQR_DOUBLEADD_64_2 0, Q4, Q5, Q0, QH, Q6, Q7, Q8 + SQR_DOUBLEADD_64_4 2, Q4, Q5, Q0, QH, Q6, Q7, Q8 + mov [Q5 + 6*8], Q7 + + // 5 iterations + xor Q8, Q8 + SQR_SIZE_SPECIFIC_INIT Q1, Q9, Q4, Q5, Q6 + SQR_DOUBLEADD_64 0, Q4, Q5, Q0, QH, Q6, Q8, Q7 // Notice the dst_carry is Q7 since all the "double" macros have Q7 as src_carry + SQR_DOUBLEADD_64_4 1, Q4, Q5, Q0, QH, Q6, Q7, Q8 + mov [Q5 + 5*8], Q7 + + // 4 iterations + xor Q7, Q7 + SQR_SIZE_SPECIFIC_INIT Q1, Q9, Q4, Q5, Q6 + SQR_DOUBLEADD_64_4 0, Q4, Q5, Q0, QH, Q6, Q7, Q8 + mov [Q5 + 4*8], Q7 + + // 3 iterations + xor Q8, Q8 + SQR_SIZE_SPECIFIC_INIT Q1, Q9, Q4, Q5, Q6 + SQR_DOUBLEADD_64 0, Q4, Q5, Q0, QH, Q6, Q8, Q7 + SQR_DOUBLEADD_64_2 1, Q4, Q5, Q0, QH, Q6, Q7, Q8 + mov [Q5 + 3*8], Q7 + + // 2 iterations + xor Q7, Q7 + SQR_SIZE_SPECIFIC_INIT Q1, Q9, Q4, Q5, Q6 + SQR_DOUBLEADD_64_2 0, Q4, Q5, Q0, QH, Q6, Q7, Q8 + mov [Q5 + 2*8], Q7 + + // 1 iterations + xor Q8, Q8 + SQR_SIZE_SPECIFIC_INIT Q1, Q9, Q4,
Q5, Q6 + SQR_DOUBLEADD_64 0, Q4, Q5, Q0, QH, Q6, Q8, Q7 + mov [Q5 + 8], Q7 + + xor QH, QH + mov [Q5 + 16], QH // Final word = 0 + + + //////////////////////////////////////////////////////////////// + // Second Pass - Shifting all results 1 bit left + //////////////////////////////////////////////////////////////// + + xor Q0, Q0 // carry flag = 0 + mov Q5, Q3 // pDst pointer + + SQR_SHIFT_LEFT 0, Q0, Q5 + SQR_SHIFT_LEFT 1, Q0, Q5 + SQR_SHIFT_LEFT 2, Q0, Q5 + SQR_SHIFT_LEFT 3, Q0, Q5 + + SQR_SHIFT_LEFT 4, Q0, Q5 + SQR_SHIFT_LEFT 5, Q0, Q5 + SQR_SHIFT_LEFT 6, Q0, Q5 + SQR_SHIFT_LEFT 7, Q0, Q5 + + SQR_SHIFT_LEFT 8, Q0, Q5 + SQR_SHIFT_LEFT 9, Q0, Q5 + SQR_SHIFT_LEFT 10, Q0, Q5 + SQR_SHIFT_LEFT 11, Q0, Q5 + + SQR_SHIFT_LEFT 12, Q0, Q5 + SQR_SHIFT_LEFT 13, Q0, Q5 + SQR_SHIFT_LEFT 14, Q0, Q5 + SQR_SHIFT_LEFT 15, Q0, Q5 + + ////////////////////////////////////////////////////////////////////////////// + // Third Pass - Adding the squares on the even columns and propagating the sum + ////////////////////////////////////////////////////////////////////////////// + + mov Q1, [rsp + GET_MEMSLOT_OFFSET(slot0)] // Q1 = pSrc + xor Q7, Q7 + + SQR_DIAGONAL_PROP 0, Q1, Q3, Q0, QH, Q7 + SQR_DIAGONAL_PROP 1, Q1, Q3, Q0, QH, Q7 + SQR_DIAGONAL_PROP 2, Q1, Q3, Q0, QH, Q7 + SQR_DIAGONAL_PROP 3, Q1, Q3, Q0, QH, Q7 + SQR_DIAGONAL_PROP 4, Q1, Q3, Q0, QH, Q7 + SQR_DIAGONAL_PROP 5, Q1, Q3, Q0, QH, Q7 + SQR_DIAGONAL_PROP 6, Q1, Q3, Q0, QH, Q7 + SQR_DIAGONAL_PROP 7, Q1, Q3, Q0, QH, Q7 + +MUL_FUNCTION_END(SymCryptFdefRawSquare512Asm) + +//VOID +//SymCryptFdefMontgomeryReduce512Asm( +// _In_ PCSYMCRYPT_MODULUS pmMod, +// _In_ PUINT32 pSrc, +// _Out_ PUINT32 pDst ) + +MUL_FUNCTION_START(SymCryptFdefMontgomeryReduce512Asm, 3, 13) + + mov D4, [Q1 + SymCryptModulusNdigitsOffsetAmd64] // nDigits + mov Q5, [Q1 + SymCryptModulusMontgomeryInv64OffsetAmd64] // inv64 + + lea Q1, [Q1 + SymCryptModulusValueOffsetAmd64] // modulus value + + mov D12, D4 // outer loop counter + shl D12, 3 // D12 is in words + + xor D9, D9 + + // General register allocations + // Q0 = multiply result + // QH = multiply result + // Q1 = pointer to modulus value + // Q2 = pSrc (updated in outer loop) + // Q3 = pDst + // D4 = nDigits + // Q5 = pmMod->tm.montgomery.inv64 + // Q6 = multiplier in inner loop + // Q7 = carry for even words (64 bits) + // Q8 = carry for odd words (64 bits) + // Q9 = carry out from last word of previous loop iteration + // Q10 = running pointer in Src + // Q11 = running pointer in Mod + // D12 = loop counter + +ALIGN(16) + +SymCryptFdefMontgomeryReduce512AsmOuterLoop: + + // start decoder with a few simple instructions, including at least one that requires + // a uop execution and is on the critical path + + mov Q6, [Q2] // fetch word of Src we want to set to zero + mov Q11, Q2 + mov Q10, Q1 + + imul Q6, Q5 // lower word is same for signed & unsigned multiply + + xor D7, D7 + + // Q0 = mul scratch + // QH = mul scratch + // Q1 = pointer to modulus value + // Q6 = multiplier + // Q7 = carry for even words (64 bits) + // Q8 = carry for odd words (64 bits) + // Q10 = running ptr to modulus + // Q11 = running ptr to input/scratch + // D12 = outer loop counter (words) + + MULT_DOUBLEADD_128 0, Q10, Q11, Q0, QH, Q6, Q7, Q8 + MULT_DOUBLEADD_128 2, Q10, Q11, Q0, QH, Q6, Q7, Q8 + MULT_DOUBLEADD_128 4, Q10, Q11, Q0, QH, Q6, Q7, Q8 + MULT_DOUBLEADD_128 6, Q10, Q11, Q0, QH, Q6, Q7, Q8 + + lea Q11,[Q11 + 64] + + add Q7, Q9 + mov D9, 0 + adc Q9, 0 + add Q7, [Q11] + adc Q9, 0 + mov [Q11], Q7 + + lea Q2,[Q2 + 8] + + dec D12 + jnz 
SymCryptFdefMontgomeryReduce512AsmOuterLoop + + // + // Most of the work is done - now all that is left is to subtract the modulus if it is smaller than the result + // + + // First we compute the pSrc result minus the modulus into the destination + mov Q11, Q2 // pSrc + mov Q10, Q1 // pMod + mov Q7, Q3 // pDst + + // Cy = 0 because the last 'adc Q9,0' resulted in 0, 1, or 2 + mov Q0,[Q11] + sbb Q0,[Q10] + mov [Q7], Q0 + + mov Q0,[Q11 + 8] + sbb Q0,[Q10 + 8] + mov [Q7 + 8], Q0 + + mov Q0,[Q11 + 16] + sbb Q0,[Q10 + 16] + mov [Q7 + 16], Q0 + + mov Q0,[Q11 + 24] + sbb Q0,[Q10 + 24] + mov [Q7 + 24], Q0 + + mov Q0,[Q11 + 32] + sbb Q0,[Q10 + 32] + mov [Q7 + 32], Q0 + + mov Q0,[Q11 + 40] + sbb Q0,[Q10 + 40] + mov [Q7 + 40], Q0 + + mov Q0,[Q11 + 48] + sbb Q0,[Q10 + 48] + mov [Q7 + 48], Q0 + + mov Q0,[Q11 + 56] + sbb Q0,[Q10 + 56] + mov [Q7 + 56], Q0 + + lea Q11,[Q11 + 64] + lea Q10,[Q10 + 64] + lea Q7,[Q7 + 64] + + // Finally a masked copy from pSrc to pDst + // copy if: Q9 == 0 && Cy = 1 + sbb D9, 0 + + movd xmm0, D9 // xmm0[0] = mask + pcmpeqd xmm1, xmm1 // xmm1 = ff...ff + pshufd xmm0, xmm0, 0 // xmm0[0..3] = mask + pxor xmm1, xmm0 // xmm1 = not Mask + +ALIGN(16) + +SymCryptFdefMontgomeryReduce512AsmMaskedCopyLoop: + movdqa xmm2, [Q2] // xmm2 = pSrc[0] + movdqa xmm3, [Q3] // xmm3 = pDst[0] + pand xmm2, xmm0 // + pand xmm3, xmm1 // + por xmm2, xmm3 + movdqa [Q3], xmm2 + + movdqa xmm2, [Q2 + 16] // xmm2 = pSrc[0] + movdqa xmm3, [Q3 + 16] // xmm3 = pDst[0] + pand xmm2, xmm0 // + pand xmm3, xmm1 // + por xmm2, xmm3 + movdqa [Q3 + 16], xmm2 + + movdqa xmm2, [Q2 + 32] // xmm2 = pSrc[0] + movdqa xmm3, [Q3 + 32] // xmm3 = pDst[0] + pand xmm2, xmm0 // + pand xmm3, xmm1 // + por xmm2, xmm3 + movdqa [Q3 + 32], xmm2 + + movdqa xmm2, [Q2 + 48] // xmm2 = pSrc[0] + movdqa xmm3, [Q3 + 48] // xmm3 = pDst[0] + pand xmm2, xmm0 // + pand xmm3, xmm1 // + por xmm2, xmm3 + movdqa [Q3 + 48], xmm2 + + // Move on to the next digit + lea Q2,[Q2 + 64] + lea Q3,[Q3 + 64] + + dec D4 + jnz SymCryptFdefMontgomeryReduce512AsmMaskedCopyLoop + +MUL_FUNCTION_END(SymCryptFdefMontgomeryReduce512Asm) + +// -------------------------------- +// 1024-bit size specific functions +// -------------------------------- + +//VOID +//SYMCRYPT_CALL +//SymCryptFdefRawMul1024Asm( +// _In_reads_(nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc1, +// _In_reads_(nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc2, +// UINT32 nDigits, +// _Out_writes_(2*nWords) PUINT32 pDst ) +MUL_FUNCTION_START(SymCryptFdefRawMul1024Asm, 4, 8) + + // Basic structure: + // for each word in Src1: + // Dst += Src2 * word + // Register assignments + // + // Q0 = tmp for mul + // QH = tmp for mul + // Q1 = pSrc1 (updated in outer loop) + // Q2 = pSrc2 (constant) + // Q3 = # words left from Src1 to process + // Q4 = pDst (incremented in outer loop) + // Q5 = word from Src1 to multiply with + // Q6 = carry for even words (64 bits) + // Q7 = carry for odd words (64 bits) + + shl Q3, 3 // nDigits * 8 = # words in Src1 to process + + mov Q5, [Q1] // mulword + xor Q6, Q6 // carry + + // First inner loop overwrites Dst, which avoids adding the current Dst value + MULT_SINGLEADD_128 0, Q2, Q4, Q0, QH, Q5, Q6, Q7 + MULT_SINGLEADD_128 2, Q2, Q4, Q0, QH, Q5, Q6, Q7 + MULT_SINGLEADD_128 4, Q2, Q4, Q0, QH, Q5, Q6, Q7 + MULT_SINGLEADD_128 6, Q2, Q4, Q0, QH, Q5, Q6, Q7 + + MULT_SINGLEADD_128 8, Q2, Q4, Q0, QH, Q5, Q6, Q7 + MULT_SINGLEADD_128 10, Q2, Q4, Q0, QH, Q5, Q6, Q7 + MULT_SINGLEADD_128 12, Q2, Q4, Q0, QH, Q5, Q6, Q7 + MULT_SINGLEADD_128 14, Q2, Q4, Q0, QH, Q5, Q6, Q7 + + mov [Q4
+ 128], Q6 // write last word, cannot overflow because Dst is at least 2 digits long + + dec Q3 + +ALIGN(16) + +SymCryptFdefRawMul1024AsmLoopOuter: + + lea Q1, [Q1 + 8] // move to next word of pSrc1 + lea Q4, [Q4 + 8] // move Dst pointer one word over + + mov Q5, [Q1] // mulword + + xor Q6, Q6 // carry + + MULT_DOUBLEADD_128 0, Q2, Q4, Q0, QH, Q5, Q6, Q7 + MULT_DOUBLEADD_128 2, Q2, Q4, Q0, QH, Q5, Q6, Q7 + MULT_DOUBLEADD_128 4, Q2, Q4, Q0, QH, Q5, Q6, Q7 + MULT_DOUBLEADD_128 6, Q2, Q4, Q0, QH, Q5, Q6, Q7 + + MULT_DOUBLEADD_128 8, Q2, Q4, Q0, QH, Q5, Q6, Q7 + MULT_DOUBLEADD_128 10, Q2, Q4, Q0, QH, Q5, Q6, Q7 + MULT_DOUBLEADD_128 12, Q2, Q4, Q0, QH, Q5, Q6, Q7 + MULT_DOUBLEADD_128 14, Q2, Q4, Q0, QH, Q5, Q6, Q7 + + mov [Q4 + 128], Q6 // write last word, cannot overflow because Dst is at least 2 digits long + + dec Q3 + jnz SymCryptFdefRawMul1024AsmLoopOuter + +MUL_FUNCTION_END(SymCryptFdefRawMul1024Asm) + +// VOID +// SYMCRYPT_CALL +// SymCryptFdefRawSquare1024Asm( +// _In_reads_(nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc, +// UINT32 nDigits, +// _Out_writes_(2*nWords) PUINT32 pDst ) +MUL_FUNCTION_START(SymCryptFdefRawSquare1024Asm, 3, 11) + + // Register assignments + // + // Q0 = tmp for mul + // QH = tmp for mul + // Q1 = outer loop pointer into pSrc + // Q2 = nDigits (constant) + // Q3 = pDst (constant) + // Q4 = inner loop pointer into pSrc + // Q5 = inner loop pointer into pDst + // Q6 = word from Src to multiply with + // Q7 = carry for even words (64 bits) + // Q8 = carry for odd words (64 bits) + // Q9 = outer loop pointer into pDst + // Q10 = outer loop counter of #words left + + mov [rsp + GET_MEMSLOT_OFFSET(slot0)], Q1 // save pSrc + + //////////////////////////////////////////////////////////////// + // First Pass - Addition of the cross products x_i*x_j with i!=j + //////////////////////////////////////////////////////////////// + + mov Q9, Q3 // Q9 = outer pDst + + mov Q4, Q1 // Q4 = inner pSrc + mov Q5, Q3 // Q5 = inner pDst + + // Initial inner loop overwrites Dst, which avoids adding the current Dst value + + // 15 iterations + xor Q8, Q8 // carry = 0 (for "odd" iterations set only the Q8 carry) + mov Q6, [Q1] // mulword + mov [Q5], Q8 // Write 0 in the first word + + SQR_SINGLEADD_64 1, Q4, Q5, Q0, QH, Q6, Q8, Q7 + SQR_SINGLEADD_64 2, Q4, Q5, Q0, QH, Q6, Q7, Q8 + SQR_SINGLEADD_64 3, Q4, Q5, Q0, QH, Q6, Q8, Q7 + + SQR_SINGLEADD_64 4, Q4, Q5, Q0, QH, Q6, Q7, Q8 + SQR_SINGLEADD_64 5, Q4, Q5, Q0, QH, Q6, Q8, Q7 + SQR_SINGLEADD_64 6, Q4, Q5, Q0, QH, Q6, Q7, Q8 + SQR_SINGLEADD_64 7, Q4, Q5, Q0, QH, Q6, Q8, Q7 + + SQR_SINGLEADD_64 8, Q4, Q5, Q0, QH, Q6, Q7, Q8 + SQR_SINGLEADD_64 9, Q4, Q5, Q0, QH, Q6, Q8, Q7 + SQR_SINGLEADD_64 10, Q4, Q5, Q0, QH, Q6, Q7, Q8 + SQR_SINGLEADD_64 11, Q4, Q5, Q0, QH, Q6, Q8, Q7 + + SQR_SINGLEADD_64 12, Q4, Q5, Q0, QH, Q6, Q7, Q8 + SQR_SINGLEADD_64 13, Q4, Q5, Q0, QH, Q6, Q8, Q7 + SQR_SINGLEADD_64 14, Q4, Q5, Q0, QH, Q6, Q7, Q8 + SQR_SINGLEADD_64 15, Q4, Q5, Q0, QH, Q6, Q8, Q7 + + mov [Q5 + 16*8], Q7 // write last word, cannot overflow because Dst is at least 2 digits long + add Q9, 8 // Skip over the first word + + // 14 iterations (adding the current Dst value) + xor Q7, Q7 // carry = 0 (for "even" iterations set only the Q7 carry) + SQR_SIZE_SPECIFIC_INIT Q1, Q9, Q4, Q5, Q6 + SQR_DOUBLEADD_64_2 0, Q4, Q5, Q0, QH, Q6, Q7, Q8 + SQR_DOUBLEADD_64_4 2, Q4, Q5, Q0, QH, Q6, Q7, Q8 + SQR_DOUBLEADD_64_8 6, Q4, Q5, Q0, QH, Q6, Q7, Q8 + mov [Q5 + 14*8], Q7 + + // 13 iterations + xor Q8, Q8 + SQR_SIZE_SPECIFIC_INIT Q1, Q9, Q4, Q5, Q6 + SQR_DOUBLEADD_64 0,
Q4, Q5, Q0, QH, Q6, Q8, Q7 // Notice the dst_carry is Q7 since all the "double" macros have Q7 as src_carry + SQR_DOUBLEADD_64_4 1, Q4, Q5, Q0, QH, Q6, Q7, Q8 + SQR_DOUBLEADD_64_8 5, Q4, Q5, Q0, QH, Q6, Q7, Q8 + mov [Q5 + 13*8], Q7 + + // 12 iterations + xor Q7, Q7 + SQR_SIZE_SPECIFIC_INIT Q1, Q9, Q4, Q5, Q6 + SQR_DOUBLEADD_64_4 0, Q4, Q5, Q0, QH, Q6, Q7, Q8 + SQR_DOUBLEADD_64_8 4, Q4, Q5, Q0, QH, Q6, Q7, Q8 + mov [Q5 + 12*8], Q7 + + // 11 iterations + xor Q8, Q8 + SQR_SIZE_SPECIFIC_INIT Q1, Q9, Q4, Q5, Q6 + SQR_DOUBLEADD_64 0, Q4, Q5, Q0, QH, Q6, Q8, Q7 + SQR_DOUBLEADD_64_2 1, Q4, Q5, Q0, QH, Q6, Q7, Q8 + SQR_DOUBLEADD_64_8 3, Q4, Q5, Q0, QH, Q6, Q7, Q8 + mov [Q5 + 11*8], Q7 + + // 10 iterations + xor Q7, Q7 + SQR_SIZE_SPECIFIC_INIT Q1, Q9, Q4, Q5, Q6 + SQR_DOUBLEADD_64_2 0, Q4, Q5, Q0, QH, Q6, Q7, Q8 + SQR_DOUBLEADD_64_8 2, Q4, Q5, Q0, QH, Q6, Q7, Q8 + mov [Q5 + 10*8], Q7 + + // 9 iterations + xor Q8, Q8 + SQR_SIZE_SPECIFIC_INIT Q1, Q9, Q4, Q5, Q6 + SQR_DOUBLEADD_64 0, Q4, Q5, Q0, QH, Q6, Q8, Q7 + SQR_DOUBLEADD_64_8 1, Q4, Q5, Q0, QH, Q6, Q7, Q8 + mov [Q5 + 9*8], Q7 + + // 8 iterations + xor Q7, Q7 + SQR_SIZE_SPECIFIC_INIT Q1, Q9, Q4, Q5, Q6 + SQR_DOUBLEADD_64_8 0, Q4, Q5, Q0, QH, Q6, Q7, Q8 + mov [Q5 + 8*8], Q7 + + // 7 iterations + xor Q8, Q8 + SQR_SIZE_SPECIFIC_INIT Q1, Q9, Q4, Q5, Q6 + SQR_DOUBLEADD_64 0, Q4, Q5, Q0, QH, Q6, Q8, Q7 + SQR_DOUBLEADD_64_2 1, Q4, Q5, Q0, QH, Q6, Q7, Q8 + SQR_DOUBLEADD_64_4 3, Q4, Q5, Q0, QH, Q6, Q7, Q8 + mov [Q5 + 7*8], Q7 + + // 6 iterations + xor Q7, Q7 + SQR_SIZE_SPECIFIC_INIT Q1, Q9, Q4, Q5, Q6 + SQR_DOUBLEADD_64_2 0, Q4, Q5, Q0, QH, Q6, Q7, Q8 + SQR_DOUBLEADD_64_4 2, Q4, Q5, Q0, QH, Q6, Q7, Q8 + mov [Q5 + 6*8], Q7 + + // 5 iterations + xor Q8, Q8 + SQR_SIZE_SPECIFIC_INIT Q1, Q9, Q4, Q5, Q6 + SQR_DOUBLEADD_64 0, Q4, Q5, Q0, QH, Q6, Q8, Q7 + SQR_DOUBLEADD_64_4 1, Q4, Q5, Q0, QH, Q6, Q7, Q8 + mov [Q5 + 5*8], Q7 + + // 4 iterations + xor Q7, Q7 + SQR_SIZE_SPECIFIC_INIT Q1, Q9, Q4, Q5, Q6 + SQR_DOUBLEADD_64_4 0, Q4, Q5, Q0, QH, Q6, Q7, Q8 + mov [Q5 + 4*8], Q7 + + // 3 iterations + xor Q8, Q8 + SQR_SIZE_SPECIFIC_INIT Q1, Q9, Q4, Q5, Q6 + SQR_DOUBLEADD_64 0, Q4, Q5, Q0, QH, Q6, Q8, Q7 + SQR_DOUBLEADD_64_2 1, Q4, Q5, Q0, QH, Q6, Q7, Q8 + mov [Q5 + 3*8], Q7 + + // 2 iterations + xor Q7, Q7 + SQR_SIZE_SPECIFIC_INIT Q1, Q9, Q4, Q5, Q6 + SQR_DOUBLEADD_64_2 0, Q4, Q5, Q0, QH, Q6, Q7, Q8 + mov [Q5 + 2*8], Q7 + + // 1 iterations + xor Q8, Q8 + SQR_SIZE_SPECIFIC_INIT Q1, Q9, Q4, Q5, Q6 + SQR_DOUBLEADD_64 0, Q4, Q5, Q0, QH, Q6, Q8, Q7 + mov [Q5 + 8], Q7 + + xor QH, QH + mov [Q5 + 16], QH // Final word = 0 + + + //////////////////////////////////////////////////////////////// + // Second Pass - Shifting all results 1 bit left + //////////////////////////////////////////////////////////////// + + xor Q0, Q0 // carry flag = 0 + mov Q5, Q3 // pDst pointer + + SQR_SHIFT_LEFT 0, Q0, Q5 + SQR_SHIFT_LEFT 1, Q0, Q5 + SQR_SHIFT_LEFT 2, Q0, Q5 + SQR_SHIFT_LEFT 3, Q0, Q5 + + SQR_SHIFT_LEFT 4, Q0, Q5 + SQR_SHIFT_LEFT 5, Q0, Q5 + SQR_SHIFT_LEFT 6, Q0, Q5 + SQR_SHIFT_LEFT 7, Q0, Q5 + + SQR_SHIFT_LEFT 8, Q0, Q5 + SQR_SHIFT_LEFT 9, Q0, Q5 + SQR_SHIFT_LEFT 10, Q0, Q5 + SQR_SHIFT_LEFT 11, Q0, Q5 + + SQR_SHIFT_LEFT 12, Q0, Q5 + SQR_SHIFT_LEFT 13, Q0, Q5 + SQR_SHIFT_LEFT 14, Q0, Q5 + SQR_SHIFT_LEFT 15, Q0, Q5 + + SQR_SHIFT_LEFT 16, Q0, Q5 + SQR_SHIFT_LEFT 17, Q0, Q5 + SQR_SHIFT_LEFT 18, Q0, Q5 + SQR_SHIFT_LEFT 19, Q0, Q5 + + SQR_SHIFT_LEFT 20, Q0, Q5 + SQR_SHIFT_LEFT 21, Q0, Q5 + SQR_SHIFT_LEFT 22, Q0, Q5 + SQR_SHIFT_LEFT 23, Q0, Q5 + + SQR_SHIFT_LEFT 24, Q0, Q5 + SQR_SHIFT_LEFT 25, Q0, 
Q5 + SQR_SHIFT_LEFT 26, Q0, Q5 + SQR_SHIFT_LEFT 27, Q0, Q5 + + SQR_SHIFT_LEFT 28, Q0, Q5 + SQR_SHIFT_LEFT 29, Q0, Q5 + SQR_SHIFT_LEFT 30, Q0, Q5 + SQR_SHIFT_LEFT 31, Q0, Q5 + + ////////////////////////////////////////////////////////////////////////////// + // Third Pass - Adding the squares on the even columns and propagating the sum + ////////////////////////////////////////////////////////////////////////////// + + mov Q1, [rsp + GET_MEMSLOT_OFFSET(slot0)] // Q1 = pSrc + xor Q7, Q7 + + SQR_DIAGONAL_PROP 0, Q1, Q3, Q0, QH, Q7 + SQR_DIAGONAL_PROP 1, Q1, Q3, Q0, QH, Q7 + SQR_DIAGONAL_PROP 2, Q1, Q3, Q0, QH, Q7 + SQR_DIAGONAL_PROP 3, Q1, Q3, Q0, QH, Q7 + SQR_DIAGONAL_PROP 4, Q1, Q3, Q0, QH, Q7 + SQR_DIAGONAL_PROP 5, Q1, Q3, Q0, QH, Q7 + SQR_DIAGONAL_PROP 6, Q1, Q3, Q0, QH, Q7 + SQR_DIAGONAL_PROP 7, Q1, Q3, Q0, QH, Q7 + + SQR_DIAGONAL_PROP 8, Q1, Q3, Q0, QH, Q7 + SQR_DIAGONAL_PROP 9, Q1, Q3, Q0, QH, Q7 + SQR_DIAGONAL_PROP 10, Q1, Q3, Q0, QH, Q7 + SQR_DIAGONAL_PROP 11, Q1, Q3, Q0, QH, Q7 + SQR_DIAGONAL_PROP 12, Q1, Q3, Q0, QH, Q7 + SQR_DIAGONAL_PROP 13, Q1, Q3, Q0, QH, Q7 + SQR_DIAGONAL_PROP 14, Q1, Q3, Q0, QH, Q7 + SQR_DIAGONAL_PROP 15, Q1, Q3, Q0, QH, Q7 + +MUL_FUNCTION_END(SymCryptFdefRawSquare1024Asm) + +//VOID +//SymCryptFdefMontgomeryReduce1024Asm( +// _In_ PCSYMCRYPT_MODULUS pmMod, +// _In_ PUINT32 pSrc, +// _Out_ PUINT32 pDst ) + +MUL_FUNCTION_START(SymCryptFdefMontgomeryReduce1024Asm, 3, 13) + + mov D4, [Q1 + SymCryptModulusNdigitsOffsetAmd64] // nDigits + mov Q5, [Q1 + SymCryptModulusMontgomeryInv64OffsetAmd64] // inv64 + + lea Q1, [Q1 + SymCryptModulusValueOffsetAmd64] // modulus value + + mov D12, D4 // outer loop counter + shl D12, 3 // D12 is in words + + xor D9, D9 + + // General register allocations + // Q0 = multiply result + // QH = multiply result + // Q1 = pointer to modulus value + // Q2 = pSrc (updated in outer loop) + // Q3 = pDst + // D4 = nDigits + // Q5 = pmMod->tm.montgomery.inv64 + // Q6 = multiplier in inner loop + // Q7 = carry for even words (64 bits) + // Q8 = carry for odd words (64 bits) + // Q9 = carry out from last word of previous loop iteration + // Q10 = running pointer in Src + // Q11 = running pointer in Mod + // D12 = loop counter + +ALIGN(16) + +SymCryptFdefMontgomeryReduce1024AsmOuterLoop: + + // start decoder with a few simple instructions, including at least one that requires + // a uop execution and is on the critical path + + mov Q6, [Q2] // fetch word of Src we want to set to zero + mov Q11, Q2 + mov Q10, Q1 + + imul Q6, Q5 // lower word is same for signed & unsigned multiply + + xor D7, D7 + + // Q0 = mul scratch + // QH = mul scratch + // Q1 = pointer to modulus value + // Q6 = multiplier + // Q7 = carry for even words (64 bits) + // Q8 = carry for odd words (64 bits) + // Q10 = running ptr to modulus + // Q11 = running ptr to input/scratch + // D12 = outer loop counter (words) + + MULT_DOUBLEADD_128 0, Q10, Q11, Q0, QH, Q6, Q7, Q8 + MULT_DOUBLEADD_128 2, Q10, Q11, Q0, QH, Q6, Q7, Q8 + MULT_DOUBLEADD_128 4, Q10, Q11, Q0, QH, Q6, Q7, Q8 + MULT_DOUBLEADD_128 6, Q10, Q11, Q0, QH, Q6, Q7, Q8 + + MULT_DOUBLEADD_128 8, Q10, Q11, Q0, QH, Q6, Q7, Q8 + MULT_DOUBLEADD_128 10, Q10, Q11, Q0, QH, Q6, Q7, Q8 + MULT_DOUBLEADD_128 12, Q10, Q11, Q0, QH, Q6, Q7, Q8 + MULT_DOUBLEADD_128 14, Q10, Q11, Q0, QH, Q6, Q7, Q8 + + lea Q11,[Q11 + 128] + + add Q7, Q9 + mov D9, 0 + adc Q9, 0 + add Q7, [Q11] + adc Q9, 0 + mov [Q11], Q7 + + lea Q2,[Q2 + 8] + + dec D12 + jnz SymCryptFdefMontgomeryReduce1024AsmOuterLoop + + // + // Most of the work is done - now all that is 
left is to subtract the modulus if it is smaller than the result + // + + // First we compute the pSrc result minus the modulus into the destination + mov D12, D4 // loop ctr + mov Q11, Q2 // pSrc + mov Q10, Q1 // pMod + mov Q7, Q3 // pDst + + // Cy = 0 because the last 'adc Q9,0' resulted in 0, 1, or 2 + +ALIGN(16) + +SymCryptFdefMontgomeryReduce1024AsmSubLoop: + mov Q0,[Q11] + sbb Q0,[Q10] + mov [Q7], Q0 + + mov Q0,[Q11 + 8] + sbb Q0,[Q10 + 8] + mov [Q7 + 8], Q0 + + mov Q0,[Q11 + 16] + sbb Q0,[Q10 + 16] + mov [Q7 + 16], Q0 + + mov Q0,[Q11 + 24] + sbb Q0,[Q10 + 24] + mov [Q7 + 24], Q0 + + mov Q0,[Q11 + 32] + sbb Q0,[Q10 + 32] + mov [Q7 + 32], Q0 + + mov Q0,[Q11 + 40] + sbb Q0,[Q10 + 40] + mov [Q7 + 40], Q0 + + mov Q0,[Q11 + 48] + sbb Q0,[Q10 + 48] + mov [Q7 + 48], Q0 + + mov Q0,[Q11 + 56] + sbb Q0,[Q10 + 56] + mov [Q7 + 56], Q0 + + lea Q11,[Q11 + 64] + lea Q10,[Q10 + 64] + lea Q7,[Q7 + 64] + + dec D12 + jnz SymCryptFdefMontgomeryReduce1024AsmSubLoop + + // Finally a masked copy from pSrc to pDst + // copy if: Q9 == 0 && Cy = 1 + sbb D9, 0 + + movd xmm0, D9 // xmm0[0] = mask + pcmpeqd xmm1, xmm1 // xmm1 = ff...ff + pshufd xmm0, xmm0, 0 // xmm0[0..3] = mask + pxor xmm1, xmm0 // xmm1 = not Mask + +ALIGN(16) + +SymCryptFdefMontgomeryReduce1024AsmMaskedCopyLoop: + movdqa xmm2, [Q2] // xmm2 = pSrc[0] + movdqa xmm3, [Q3] // xmm3 = pDst[0] + pand xmm2, xmm0 // + pand xmm3, xmm1 // + por xmm2, xmm3 + movdqa [Q3], xmm2 + + movdqa xmm2, [Q2 + 16] // xmm2 = pSrc[0] + movdqa xmm3, [Q3 + 16] // xmm3 = pDst[0] + pand xmm2, xmm0 // + pand xmm3, xmm1 // + por xmm2, xmm3 + movdqa [Q3 + 16], xmm2 + + movdqa xmm2, [Q2 + 32] // xmm2 = pSrc[0] + movdqa xmm3, [Q3 + 32] // xmm3 = pDst[0] + pand xmm2, xmm0 // + pand xmm3, xmm1 // + por xmm2, xmm3 + movdqa [Q3 + 32], xmm2 + + movdqa xmm2, [Q2 + 48] // xmm2 = pSrc[0] + movdqa xmm3, [Q3 + 48] // xmm3 = pDst[0] + pand xmm2, xmm0 // + pand xmm3, xmm1 // + por xmm2, xmm3 + movdqa [Q3 + 48], xmm2 + + // Move on to the next digit + lea Q2,[Q2 + 64] + lea Q3,[Q3 + 64] + + dec D4 + jnz SymCryptFdefMontgomeryReduce1024AsmMaskedCopyLoop + +MUL_FUNCTION_END(SymCryptFdefMontgomeryReduce1024Asm) + +FILE_END() diff --git a/lib/amd64/fdef_mul_macros.asm b/lib/amd64/fdef_mul_macros.asm deleted file mode 100644 index 24ccd43..0000000 --- a/lib/amd64/fdef_mul_macros.asm +++ /dev/null @@ -1,224 +0,0 @@ -; -; Macros for the multiplication routines in amd64 -; Copyright (c) Microsoft Corporation. Licensed under the MIT license.
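The deleted macros below are easier to follow against a portable reference. Here is a hedged C sketch of the word-by-word multiply-accumulate that MULT_SINGLEADD_128 and MULT_DOUBLEADD_128 unroll (muladd_word is an illustrative name only; unsigned __int128 is the GCC/Clang stand-in for the 64x64->128 MUL instruction):

    #include <stddef.h>
    #include <stdint.h>

    /* Dst[0..n-1] += Src[0..n-1] * mulword; returns the carry word that the
       callers store one word past the end. The SINGLEADD variant is the same
       with the "+ dst[i]" dropped, since the first row overwrites Dst. */
    static uint64_t muladd_word( uint64_t *dst, const uint64_t *src, size_t n, uint64_t mulword )
    {
        uint64_t carry = 0;

        for( size_t i = 0; i < n; i++ )
        {
            unsigned __int128 p = (unsigned __int128) src[i] * mulword + dst[i] + carry;
            dst[i] = (uint64_t) p;
            carry = (uint64_t) (p >> 64);
        }
        return carry;
    }

The even/odd carry registers in the asm are two names for the same logical carry chain; alternating them lets each multiply's result land in a fresh register.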
-; - -; General multiplication - -MULT_SINGLEADD_128 MACRO index, src_reg, dst_reg - ; rax = mul scratch - ; rbx = multiplier - ; rdx = mul scratch - ; src_reg = running ptr to input - ; dst_reg = running ptr to output/scratch - ; r12 = carry for even words (64 bits) - ; r15 = carry for odd words (64 bits) - - mov rax, [src_reg + 8*index] - mul rbx - mov r15, rdx - add rax, r12 - mov [dst_reg + 8*index], rax - adc r15, 0 - - mov rax, [src_reg + 8*(index+1)] - mul rbx - mov r12, rdx - add rax, r15 - mov [dst_reg + 8*(index+1)], rax - adc r12, 0 - - ENDM - -MULT_DOUBLEADD_128 MACRO index, src_reg, dst_reg - ; rax = mul scratch - ; rbx = multiplier - ; rdx = mul scratch - ; src_reg = running ptr to input - ; dst_reg = running ptr to output/scratch - ; r12 = carry for even words (64 bits) - ; r15 = carry for odd words (64 bits) - - mov rax, [src_reg + 8*index] - mul rbx - mov r15, rdx - add rax, [dst_reg + 8*index] - adc r15, 0 - add rax, r12 - mov [dst_reg + 8*index], rax - adc r15, 0 - - mov rax, [src_reg + 8*(index+1)] - mul rbx - mov r12, rdx - add rax, [dst_reg + 8*(index+1)] - adc r12, 0 - add rax, r15 - mov [dst_reg + 8*(index+1)], rax - adc r12, 0 - - ENDM - -; Squaring - -SQR_SINGLEADD_64 MACRO index, src_reg, dst_reg, src_carry, dst_carry - ; rax = mul scratch - ; rbx = multiplier - ; rdx = mul scratch - ; src_reg = running ptr to input - ; dst_reg = running ptr to output/scratch - ; src_carry = input carry - ; dst_carry = output carry - - mov rax, [src_reg + 8*index] - mul rbx - mov dst_carry, rdx - add rax, src_carry - mov [dst_reg + 8*index], rax - adc dst_carry, 0 - - ENDM - -SQR_DOUBLEADD_64 MACRO index, src_reg, dst_reg, src_carry, dst_carry - ; rax = mul scratch - ; rbx = multiplier - ; rdx = mul scratch - ; src_reg = running ptr to input - ; dst_reg = running ptr to output/scratch - ; src_carry = input carry - ; dst_carry = output carry - - mov rax, [src_reg + 8*index] - mul rbx - mov dst_carry, rdx - add rax, [dst_reg + 8*index] - adc dst_carry, 0 - add rax, src_carry - mov [dst_reg + 8*index], rax - adc dst_carry, 0 - - ENDM - -SQR_SHIFT_LEFT MACRO index - mov rax, [rdi + 8*index] - adc rax, rax ; Shift let and add the carry - mov [rdi + 8*index], rax - ENDM - -SQR_DIAGONAL_PROP MACRO index - ;;;;;;;;;;;;;;;;;;;;;;;; - ; Calculating the square - mov rax, [rsi + 8*index] ; mulword - mul rax ; m^2 - - ; Adding the square to the even column - add rax, [rdi + 16*index] - adc rdx, 0 - add rax, r12 - adc rdx, 0 - mov [rdi + 16*index], rax - - ; Propagating the sum to the next column - mov rax, rdx - xor rdx, rdx - - add rax, [rdi + 16*index + 8] - adc rdx, 0 - mov [rdi + 16*index + 8], rax - mov r12, rdx - ENDM - -; Size-specific macros -; A common prologue & epilogue between several functions allows jumping between them... 
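Stepping back: SQR_SINGLEADD_64/SQR_DOUBLEADD_64, SQR_SHIFT_LEFT, and SQR_DIAGONAL_PROP above are the building blocks of a three-pass squaring, the same scheme the 512- and 1024-bit square functions earlier in this patch unroll. A hedged C model of the whole pass structure (square_model is an illustrative name, under the same unsigned __int128 assumption as above):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    static void square_model( const uint64_t *a, size_t n, uint64_t *d )    /* d has 2n words */
    {
        memset( d, 0, 2 * n * sizeof(uint64_t) );

        for( size_t i = 0; i < n; i++ )             /* pass 1: cross products a[i]*a[j], i < j */
        {
            uint64_t carry = 0;
            for( size_t j = i + 1; j < n; j++ )
            {
                unsigned __int128 p = (unsigned __int128) a[i] * a[j] + d[i + j] + carry;
                d[i + j] = (uint64_t) p;
                carry = (uint64_t) (p >> 64);
            }
            d[i + n] = carry;                       /* untouched so far, cannot overflow */
        }

        uint64_t cy = 0;
        for( size_t k = 0; k < 2 * n; k++ )         /* pass 2: double, via a 1-bit left shift */
        {
            uint64_t hi = d[k] >> 63;
            d[k] = (d[k] << 1) | cy;
            cy = hi;
        }

        unsigned __int128 c = 0;
        for( size_t i = 0; i < n; i++ )             /* pass 3: squares on the even columns */
        {
            c += (unsigned __int128) a[i] * a[i] + d[2 * i];
            d[2 * i] = (uint64_t) c;
            c = (c >> 64) + d[2 * i + 1];
            d[2 * i + 1] = (uint64_t) c;
            c >>= 64;                               /* 0 or 1, propagated to the next column */
        }
    }

Pass 1 needs n(n-1)/2 multiplications and pass 3 another n, roughly half of a general n-by-n product, which is why squaring gets dedicated code at all.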
- -MULT_COMMON_PROLOGUE MACRO - ; We need all the registers - push_reg r12 - push_reg r13 - push_reg r14 - push_reg r15 - push_reg rdi - push_reg rsi - push_reg rbx - push_reg rbp - - END_PROLOGUE - ENDM - -MULT_COMMON_EPILOGUE MACRO - BEGIN_EPILOGUE - - pop rbp - pop rbx - pop rsi - pop rdi - pop r15 - pop r14 - pop r13 - pop r12 - ret - ENDM - - -MUL14 MACRO Mult, pA, R0, R1, R2, R3, Cy - ; (R0, R1, R2, R3, rdx) = Mult * (A0..3) + (R0, R1, R2, R3) - ; Cy, rax = scratch - - mov rax, [pA] - mul Mult - add R0, rax - adc rdx, 0 - mov Cy, rdx - - mov rax, [pA + 8] - mul Mult - add R1, rax - adc rdx, 0 - add R1, Cy - adc rdx, 0 - mov Cy, rdx - - mov rax, [pA + 16] - mul Mult - add R2, rax - adc rdx, 0 - add R2, Cy - adc rdx, 0 - mov Cy, rdx - - mov rax, [pA + 24] - mul Mult - add R3, rax - adc rdx, 0 - add R3, Cy - adc rdx, 0 - - ENDM - -; Macros for size-specific squaring - -SQR_DOUBLEADD_64_2 MACRO index - SQR_DOUBLEADD_64 (index), rsi, rdi, r12, r15 - SQR_DOUBLEADD_64 (index + 1), rsi, rdi, r15, r12 - ENDM - -SQR_DOUBLEADD_64_4 MACRO index - SQR_DOUBLEADD_64_2 (index) - SQR_DOUBLEADD_64_2 (index + 2) - ENDM - -SQR_DOUBLEADD_64_8 MACRO index - SQR_DOUBLEADD_64_4 (index) - SQR_DOUBLEADD_64_4 (index + 4) - ENDM - -SQR_SIZE_SPECIFIC_INIT MACRO - lea rcx, [rcx + 8] ; move Src pointer 1 word over - lea r10, [r10 + 16] ; move Dst pointer 2 words over - - mov rsi, rcx ; rsi = inner pSrc - mov rdi, r10 ; rdi = inner pDst - - mov rbx, [rcx] ; Get the next mulword - lea rsi, [rsi + 8] ; move Src pointer 1 word over - ENDM \ No newline at end of file diff --git a/lib/amd64/fdef_mulx.asm b/lib/amd64/fdef_mulx.asm deleted file mode 100644 index 0165923..0000000 --- a/lib/amd64/fdef_mulx.asm +++ /dev/null @@ -1,1680 +0,0 @@ -; -; fdef_asm.asm Assembler code for large integer arithmetic in the default data format -; -; Copyright (c) Microsoft Corporation. Licensed under the MIT license. - -include ksamd64.inc - -include symcrypt_version.inc -include symcrypt_magic.inc - - - -include C_asm_shared.inc - -; A digit consists of 8 words of 64 bits each - -;The MULX/ADCX/ADOX instructions greatly speed up multi-precision arithmetic. -;A set of MULX + ADCX + ADOX can implement a single 64x64->128 plus two 64-bit additions in a single clock cycle (throughput) -;However, that speed puts pressure on other parts of the system. -; -;The code size for these three instructions is 18 cycles, whereas the pre-decoder on Broadwell reportedly can only -; load 16 bytes per cycle. -;That means the pre-decoder need 9 cycles per 8 multiplications, plus one for the per-row-of-8 overhead, meaning we need 10 -;cycles for 8 multiplications. Except that I have measured the 18 bytes as taking 1 cycle each, so the decoder must have a -; higher bandwidth. -; -;If we keep the code size small enough to fit in the uop cache, then the pre-decoder bottleneck goes away which should save us -;8 cycles per 512x512 multiplication. -; -;Code size for 512x512 is 64 multiplications at 18 bytes each = 36 cache lines of 32 bytes which need 72 uop cache lines that -;each contain up to 6 uops. (Each 32-byte code cache line contains 7 or so uops, so the 6 uops per uop cache line isn't enough.) -;The total uop cache is 256 lines, so we could fit 3+ copies of the 512x512 code. -; -;But we need the following: -;- A core 512x512 multiplication in a loop -;- Either zero the 8 carry registers up front (3 cycles), or have a separate 512x512 multiplication that sets up the carry registers. -; This latter is less code, alleviating the decoder bottleneck a bit. 
-;- A 512x512 multiplication that computes the Montgovery multipliers in-line -;- Code for squaring using MULX/ADX. -; -;The 512x512 unrolling is really necessary to get the performance; using 256x256 adds more overhead that we could gain back from the -;uop cache, and it uses more computations and will in general be slower. -; -;The full modexp loop also contains things like masked copies, ScsTable, etc. -;All in all, I don't see how we can keep all this inside the uop cache. -;Therefore, we will ignore the uop cache and optimize the code without it. -; -;Basic bottlenecks: -;- Pre-decoder at 16 bytes/cycle (turns out to be more...) -;- Decoder which can decode 1-1-1-1, 2-1-1, 3-1 (although some sources claim it doesn't) and 4 per cycle -;- One source claims that mulx takes 2 uops, and mulx with memory argument 3 uops which would limit the decoder throughput to -; require 2 cycles per mulx(mem)/adox/adcx triplet. -; -; We have verified experimentally that on Broadwell, a sequence of 1024 triples of (MULX w/ memory operand, adox, adcx) runs -; at 1 cycle per triple. As this code is too large for the uop cache, the pre-decoders and decoders are fast enough. -; Adding a fourth instruction to the tuple makes it run at 2 cycles/tuple. -; This is consistent with: -; - Pre-decoder is able to process at least 18 bytes per cycle -; - Mulx is 1 uop, Mulx + memory read is 2 uops -; - Decoder can produce 4 uops per cycle. -; -;Basic multiplication operation: -; -; We have one set of macros that do 8 words times 1 word, leaving 8 words carry in registers -; 8 of these 8x1 multiplications in sequence forms an 8x8, which is the inner loop body -; (First iteration is slightly differently and done first outside the loop) -; The inner loop iterates this to get an 8n * 8 multiplication -; The outer loop iterates this to get an 8n * 8m multiplication -; -; Our bottleneck seems to be the pre-decoder which can only run 16 bytes of code each clock cycle. -; (The uop cache is too small to hold our square+multiply+montgomery reduction code.) -; Thus we don't use zero-output and then multiply-and-add, but rather have separate copies -; of the code for the first iteration to do multiply-without-add as that cuts down on the total amount of code -; we need, and with that reduces the pre-decoder usage. 
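For reference, compilers expose the same instruction triple through intrinsics. A hedged sketch of one step of the MULADD18 pattern defined below, compiled with BMI2 and ADX enabled (e.g. -mbmi2 -madx on GCC/Clang); whether the compiler actually emits adcx/adox for the two independent chains is up to its ADX support, so treat this as a model of the data flow rather than a guaranteed instruction mapping:

    #include <stdint.h>
    #include <immintrin.h>      /* _mulx_u64 (BMI2), _addcarry_u64 */

    /* One word of A times B[0]: the low half feeds the CF-style chain into
       the low accumulator, the high half feeds the second (adox-style) chain. */
    static void muladd_step( unsigned long long aWord, unsigned long long b0,
                             unsigned long long *loAcc, unsigned long long *hiAcc,
                             unsigned char *cf, unsigned char *of )
    {
        unsigned long long hi;
        unsigned long long lo = _mulx_u64( aWord, b0, &hi );    /* 64x64 -> 128, flags untouched */

        *cf = _addcarry_u64( *cf, *loAcc, lo, loAcc );
        *of = _addcarry_u64( *of, *hiAcc, hi, hiAcc );
    }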
-; - -MULADD18 MACRO R0, R1, R2, R3, R4, R5, R6, R7, pD, pA, pB, T0, T1 - ; R0:R[7:1]:D[0] = A[7:0] * B[0] + D[0] + R[7:0] - ; Pre: Cy = Ov = 0 - ; Post: Cy = Ov = 0 - - mov rdx, [pB] - adox R0, [pD] - - mulx T1, T0, [pA + 0 * 8] - adcx R0, T0 - adox R1, T1 - - mulx T1, T0, [pA + 1 * 8] - adcx R1, T0 - adox R2, T1 - - mulx T1, T0, [pA + 2 * 8] - adcx R2, T0 - adox R3, T1 - - mulx T1, T0, [pA + 3 * 8] - adcx R3, T0 - adox R4, T1 - - mulx T1, T0, [pA + 4 * 8] - adcx R4, T0 - adox R5, T1 - - mulx T1, T0, [pA + 5 * 8] - adcx R5, T0 - adox R6, T1 - - mulx T1, T0, [pA + 6 * 8] - adcx R6, T0 - adox R7, T1 - - mulx T1, T0, [pA + 7 * 8] - adcx R7, T0 - mov [pD], R0 - - mov R0, 0 - adcx R0, R0 - adox R0, T1 - - ENDM ; MULADD18 - - -MULADD88 MACRO R0, R1, R2, R3, R4, R5, R6, R7, pD, pA, pB, T0, T1 - ; pre & post: Cy = Ov = 0 - ; R[7-0]:D[7-0] = A[7:0] * B[7:0] + R[7:0] + D[7:0] - ; rdx is volatile - - MULADD18 R0, R1, R2, R3, R4, R5, R6, R7, pD , pA, pB , T0, T1 - MULADD18 R1, R2, R3, R4, R5, R6, R7, R0, pD + 8, pA, pB + 8, T0, T1 - MULADD18 R2, R3, R4, R5, R6, R7, R0, R1, pD + 16, pA, pB + 16, T0, T1 - MULADD18 R3, R4, R5, R6, R7, R0, R1, R2, pD + 24, pA, pB + 24, T0, T1 - MULADD18 R4, R5, R6, R7, R0, R1, R2, R3, pD + 32, pA, pB + 32, T0, T1 - MULADD18 R5, R6, R7, R0, R1, R2, R3, R4, pD + 40, pA, pB + 40, T0, T1 - MULADD18 R6, R7, R0, R1, R2, R3, R4, R5, pD + 48, pA, pB + 48, T0, T1 - MULADD18 R7, R0, R1, R2, R3, R4, R5, R6, pD + 56, pA, pB + 56, T0, T1 - - ENDM ;MULADD88 - -HALF_SQUARE_NODIAG8 MACRO R0, R1, R2, R3, R4, R5, R6, R7, pD, pA, T0, T1 - ; pre & post: Cy = Ov = 0 - ; R[7-0]:D[7-0] = D[7:0] + (A[0:7]^2 - \sum_{i=0}^7 (A[i] * 2^{64*i}) )/2 - ; This is the component of the square that needs to be doubled, and then the diagonals added - ; rdx is volatile - - ; Note that Dst[0] is not changed by this macro - - mov rdx, [pA + 0 * 8] ; rdx = A0 - mov R1, [pD + 1 * 8] - mov R2, [pD + 2 * 8] - mov R3, [pD + 3 * 8] - mov R4, [pD + 4 * 8] - mov R5, [pD + 5 * 8] - mov R6, [pD + 6 * 8] - mov R7, [pD + 7 * 8] - xor R0, R0 - - mulx T1, T0, [pA + 1 * 8] - adcx R1, T0 - adox R2, T1 - - mulx T1, T0, [pA + 2 * 8] - adcx R2, T0 - adox R3, T1 - - mulx T1, T0, [pA + 3 * 8] - adcx R3, T0 - adox R4, T1 - - mulx T1, T0, [pA + 4 * 8] - adcx R4, T0 - adox R5, T1 - - mulx T1, T0, [pA + 5 * 8] - adcx R5, T0 - adox R6, T1 - - mulx T1, T0, [pA + 6 * 8] - adcx R6, T0 - adox R7, T1 - - mulx T1, T0, [pA + 7 * 8] - adcx R7, T0 - mov [pD + 1 * 8], R1 - - adcx R0, R0 - adox R0, T1 - mov [pD + 2 * 8], R2 - mov rdx, [pA + 1 * 8] ; rdx = A1 - - ;======= - - mulx T1, T0, [pA + 2 * 8] - adcx R3, T0 - adox R4, T1 - - mulx T1, T0, [pA + 3 * 8] - adcx R4, T0 - adox R5, T1 - - mulx T1, T0, [pA + 4 * 8] - adcx R5, T0 - adox R6, T1 - - mulx T1, T0, [pA + 5 * 8] - adcx R6, T0 - adox R7, T1 - - mulx T1, T0, [pA + 6 * 8] - adcx R7, T0 - adox R0, T1 - - mov rdx, [pA + 7 * 8] ; rdx = A7 - mov R1, 0 - mov R2, 0 - mov [pD + 3 * 8], R3 - - mulx T1, T0, [pA + 1 * 8] - adcx R0, T0 - adox R1, T1 ; doesn't produce Ov as T1 <= 0xff..fe and R1=0 - - mulx T1, T0, [pA + 2 * 8] - adcx R1, T0 - mov [pD + 4 * 8], R4 - - adcx R2, T1 - mov rdx, [pA + 2 * 8] ;rdx = A2 - - ;====== - - mulx T1, T0, [pA + 3 * 8] - adcx R5, T0 - adox R6, T1 - - mulx T1, T0, [pA + 4 * 8] - adcx R6, T0 - adox R7, T1 - - mulx T1, T0, [pA + 5 * 8] - adcx R7, T0 - adox R0, T1 - - mulx T1, T0, [pA + 6 * 8] - adcx R0, T0 - adox R1, T1 - - mov rdx, [pA + 4 * 8] ; rdx = A4 - mov R3, 0 - mov R4, 0 - - mulx T1, T0, [pA + 5 * 8] - adcx R1, T0 - adox R2, T1 - - mulx T1,T0, [pA + 6 * 8] - 
adcx R2, T0 - adox R3, T1 ; doesn't produce Ov as T1 <= 0xff..fe and R3=0 - - mov rdx, [pA + 5 * 8] ;rdx = A5 - mov [pD + 5 * 8], R5 - - mulx T1, T0, [pA + 6 * 8] - adcx R3, T0 - adcx R4, T1 - - mov rdx, [pA + 3 * 8] ;rdx = A3 - mov [pD + 6 * 8], R6 - - ;====== - - mulx T1, T0, [pA + 4 * 8] - adcx R7, T0 - adox R0, T1 - - mulx T1, T0, [pA + 5 * 8] - adcx R0, T0 - adox R1, T1 - - mulx T1, T0, [pA + 6 * 8] - adcx R1, T0 - adox R2, T1 - - mulx T1, T0, [pA + 7 * 8] - adcx R2, T0 - adox R3, T1 - - mov rdx, [pA + 7 * 8] ;rdx = A7 - mov R5, 0 - mov R6, 0 - mov [pD + 7 * 8], R7 - - mulx T1, T0, [pA + 4 * 8] - adcx R3, T0 - adox R4, T1 - - mulx T1, T0, [pA + 5 * 8] - adcx R4, T0 - adox R5, T1 ; doesn't produce Ov as T1 <= 0xff..fe and R5=0 - - mulx T1, T0, [pA + 6 * 8] - adcx R5, T0 - adcx R6, T1 - - xor R7, R7 - - ENDM - -MONTGOMERY18 MACRO R0, R1, R2, R3, R4, R5, R6, R7, modInv, pMod, pMont, T0, T1 - ; Mont[0] = (modinv * R0 mod 2^64) - ; R0:R[7:1]: = Mont[0] * Mod[7:0] + R[7:0] - ; Pre: - - ; Post: - - mov rdx, R0 - imul rdx, modInv - - mov [pMont], rdx - - xor T0, T0 ; Reset Cy = Ov = 0 - - mulx T1, T0, [pMod + 0 * 8] - adcx R0, T0 ; R0 = 0 here, but it produces a carry unless R0=0 at the start - adox R1, T1 - - mulx T1, T0, [pMod + 1 * 8] - adcx R1, T0 - adox R2, T1 - - mulx T1, T0, [pMod + 2 * 8] - adcx R2, T0 - adox R3, T1 - - mulx T1, T0, [pMod + 3 * 8] - adcx R3, T0 - adox R4, T1 - - mulx T1, T0, [pMod + 4 * 8] - adcx R4, T0 - adox R5, T1 - - mulx T1, T0, [pMod + 5 * 8] - adcx R5, T0 - adox R6, T1 - - mulx T1, T0, [pMod + 6 * 8] - adcx R6, T0 - adox R7, T1 - - mulx T1, T0, [pMod + 7 * 8] - adcx R7, T0 - - ; R0 = 0 here due to our modinv invariant... - - adcx R0, R0 - adox R0, T1 - - ENDM - -ZEROREG MACRO R - xor R,R - ENDM - -ZEROREG_8 MACRO R0, R1, R2, R3, R4, R5, R6, R7 - ZEROREG R0 - ZEROREG R1 - ZEROREG R2 - ZEROREG R3 - ZEROREG R4 - ZEROREG R5 - ZEROREG R6 - ZEROREG R7 - ENDM - -;VOID -;SYMCRYPT_CALL -;SymCryptFdefRawMul( -; _In_reads_(nWords1) PCUINT32 pSrc1, -; UINT32 nDigits1, -; _In_reads_(nWords2) PCUINT32 pSrc2, -; UINT32 nDigits2, -; _Out_writes_(nWords1 + nWords2) PUINT32 pDst ) - -SymCryptFdefRawMulMulx_Frame struct - SavedRbp dq ? - SavedRbx dq ? - SavedRsi dq ? - SavedRdi dq ? - SavedR15 dq ? - SavedR14 dq ? - SavedR13 dq ? - SavedR12 dq ? - returnaddress dq ? - pSrc1Home dq ? - nDigits1Home dq ? - pSrc2Home dq ? - nDigits2Home dq ? - pDst dq ? 
- -SymCryptFdefRawMulMulx_Frame ends - - NESTED_ENTRY SymCryptFdefRawMulMulx, _TEXT - - ; We need all the registers - push_reg r12 - push_reg r13 - push_reg r14 - push_reg r15 - push_reg rdi - push_reg rsi - push_reg rbx - push_reg rbp - - END_PROLOGUE - - mov [rsp + SymCryptFdefRawMulMulx_Frame.pSrc1Home], rcx - mov [rsp + SymCryptFdefRawMulMulx_Frame.nDigits1Home], rdx - mov [rsp + SymCryptFdefRawMulMulx_Frame.pSrc2Home], r8 - mov [rsp + SymCryptFdefRawMulMulx_Frame.nDigits2Home], r9 - - ; rcx = pSrc1 - ; rdx = nDigits1 - ; r8 = pSrc2 - ; r9 = nDigits2 - ; pDst on stack - - ; pSrc1/Digits1 = outer loop - ; pSrc2/Digits2 = inner loop - - ; First we wipe nDigits2 of the result (size of in) - mov rbx,[rsp + SymCryptFdefRawMulMulx_Frame.pDst] - mov rdi, rbx - - ; Wipe destination for nDigit2 blocks - xorps xmm0,xmm0 ; Zero register for 16-byte wipes - mov rax, r9 - -SymCryptFdefRawMulMulxWipeLoop: - movaps [rbx],xmm0 - movaps [rbx+16],xmm0 ; Wipe 32 bytes - movaps [rbx+32],xmm0 ; Wipe 32 bytes - movaps [rbx+48],xmm0 ; Wipe 32 bytes - add rbx, 64 - sub rax, 1 - jnz SymCryptFdefRawMulMulxWipeLoop - - -SymCryptFdefRawMulxOuterLoop: - - ZEROREG_8 rsi, rbp, r10, r11, r12, r13, r14, r15 ; Leaves Cy = Ov = 0 - -SymCryptFdefRawMulMulxInnerLoop: - - ; Register allocation in loops: - ; rsi, rbp, r10, r11, r12, r13, r14, r15 8-word carry - ; rax, rbx temps for multiplication - ; rcx, r8 pSrc1, pSrc2 running pointers - ; r9 inner loop counter - ; rdx fixed input reg for multiplication - ; rdi Destination running pointer inner loop - ; rsp[pDst] Destination running pointer outer loop - ; rsp[nDigits1] outer loop counter - - MULADD88 rsi, rbp, r10, r11, r12, r13, r14, r15, rdi, rcx, r8, rax, rbx - - add r8, 64 ; Src2 ptr - add rdi, 64 - - sub r9d, 1 ; sets Cy = Ov = 0 because r9 < 2^32 / 64 - jnz SymCryptFdefRawMulMulxInnerLoop - - ; Write the 8-word carry-out to the destination - mov [rdi + 0*8], rsi - mov [rdi + 1*8], rbp - mov [rdi + 2*8], r10 - mov [rdi + 3*8], r11 - mov [rdi + 4*8], r12 - mov [rdi + 5*8], r13 - mov [rdi + 6*8], r14 - mov [rdi + 7*8], r15 - - ; set up for next iteration - ; reset rdi & increment - mov rdi, [rsp + SymCryptFdefRawMulMulx_Frame.pDst] - add rdi, 64 - mov [rsp + SymCryptFdefRawMulMulx_Frame.pDst], rdi - - ; reload pSrc2/nDigits2 - mov r9, [rsp + SymCryptFdefRawMulMulx_Frame.nDigits2Home] - mov r8, [rsp + SymCryptFdefRawMulMulx_Frame.pSrc2Home] - - ; update PSrc1 - add rcx, 64 - - ; nDigits1 loop counter - mov rax, [rsp + SymCryptFdefRawMulMulx_Frame.nDigits1Home] - sub rax, 1 ; leaves Cy = Ov = 0 because nDigits1 < 2^32 / 64 - mov [rsp + SymCryptFdefRawMulMulx_Frame.nDigits1Home], rax - - jnz SymCryptFdefRawMulxOuterLoop - - BEGIN_EPILOGUE - - pop rbp - pop rbx - pop rsi - pop rdi - pop r15 - pop r14 - pop r13 - pop r12 - ret - - NESTED_END SymCryptFdefRawMulMulx, _TEXT - -; VOID -; SYMCRYPT_CALL -; SymCryptFdefRawSquare( -; _In_reads_(nDgigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc, -; UINT32 nDigits, -; _Out_writes_(2*nWords) PUINT32 pDst ) - -SymCryptFdefRawSquareMulx_Frame struct - - SavedRbp dq ? - SavedRbx dq ? - SavedRsi dq ? - SavedRdi dq ? - SavedR15 dq ? - SavedR14 dq ? - SavedR13 dq ? - SavedR12 dq ? - returnaddress dq ? - pSrcHome dq ? - - ; Two 32-bit local variables, in the space of one normal 64-bit stack slot - nDigitsHome dd ? ; 32 bits, original argument to function - nextNDigits dd ? ; 32 bits; number of digits to do in the next sequence of inner loops. - - pDstHome dq ? - pDstPtr dq ? 
; pDst running pointer outer loop (This is the 4th argument stack slot which is always available.) - -SymCryptFdefRawSquareMulx_Frame ends - - NESTED_ENTRY SymCryptFdefRawSquareMulx, _TEXT - - ; We need all the registers - push_reg r12 - push_reg r13 - push_reg r14 - push_reg r15 - push_reg rdi - push_reg rsi - push_reg rbx - push_reg rbp - - END_PROLOGUE - - ; rcx = pSrc - ; rdx = nDigits - ; r8 = pDst - - ; Save parameters for phase 2 - mov [rsp + SymCryptFdefRawSquareMulx_Frame.pSrcHome], rcx - mov [rsp + SymCryptFdefRawSquareMulx_Frame.nDigitsHome], edx - mov [rsp + SymCryptFdefRawSquareMulx_Frame.pDstHome], r8 - - ; Initialize our local variables - mov [rsp + SymCryptFdefRawSquareMulx_Frame.nextNDigits], edx - mov [rsp + SymCryptFdefRawSquareMulx_Frame.pDstPtr], r8 - - mov r9d, edx ; rdx is used in the multiplications... - - ; Wipe destination for nDigits blocks - - xor rax, rax - mov rbx, r8 - ; we'll use the edx digit counter destructively... - -SymCryptFdefRawSquareMulxWipeLoop: - ; we use 8-byte writes as we will be reading this very soon in 8-byte chunks, and this way the store-load - ; forwarding works - mov [rbx ], rax - mov [rbx + 8], rax - mov [rbx + 16], rax - mov [rbx + 24], rax - mov [rbx + 32], rax - mov [rbx + 40], rax - mov [rbx + 48], rax - mov [rbx + 56], rax - add rbx, 64 - sub edx, 1 - jnz SymCryptFdefRawSquareMulxWipeLoop - - ; Cy = Ov = 0 here because the last 'sub edx,1' yielded 0 - -SymCryptFdefRawSquareMulxOuterLoop: - - HALF_SQUARE_NODIAG8 rsi, rbp, r10, r11, r12, r13, r14, r15, r8, rcx, rax, rbx - - sub r9d, 1 - jz SymCryptFdefRawSquareMulxPhase2 ; end of phase 1 - - lea rdi, [rcx + 64] - lea r8, [r8 + 64] - -SymCryptFdefRawSquareMulxInnerLoop: - ; rsi, rbp, r10, r11, r12, r13, r14, r15 8-word carry - ; rax, rbx temps for multiplication - ; rcx pSrc running pointer outer loop - ; r8 pDst running pointer inner loop - ; r9d inner loop nDigit counter - ; rdx fixed input reg for multiplication - ; rdi pSrc running pointer inner loop - - ; rsp[pSrc] pSrc (used for final pass) - ; rsp[nDigits] nDigits (used for final pass) - ; rsp[pDst] pDst (used for final pass) - ; rsp[nextNDigits] # inner loop blocks in next outer loop iteration - ; rsp[pDstPtr] pDst running pointer outer loop - - MULADD88 rsi, rbp, r10, r11, r12, r13, r14, r15, r8, rcx, rdi, rax, rbx - - add r8, 64 - add rdi, 64 - - sub r9d, 1 ; Sets Cy = Ov = 0 because r9d < 2^32 / bits_per_digit - jnz SymCryptFdefRawSquareMulxInnerLoop - - ; Write the 8-word carry-out to the destination - mov [r8 + 0*8], rsi - mov [r8 + 1*8], rbp - mov [r8 + 2*8], r10 - mov [r8 + 3*8], r11 - mov [r8 + 4*8], r12 - mov [r8 + 5*8], r13 - mov [r8 + 6*8], r14 - mov [r8 + 7*8], r15 - - add rcx, 64 - - mov r8, [rsp + SymCryptFdefRawSquareMulx_Frame.pDstPtr] - add r8, 128 ; Shift output ptr by 2 digits - mov [rsp + SymCryptFdefRawSquareMulx_Frame.pDstPtr], r8 - - mov r9d, [rsp + SymCryptFdefRawSquareMulx_Frame.nextNDigits] - sub r9d, 1 - mov [rsp + SymCryptFdefRawSquareMulx_Frame.nextNDigits], r9d - - jmp SymCryptFdefRawSquareMulxOuterLoop - - -SymCryptFdefRawSquareMulxPhase2: - ; Cy = Ov = 0 because last 'sub r9d, 1' resulted in 0 - - ; Write the 8-word carry-out to the destination - mov [r8 + 8*8], rsi - mov [r8 + 9*8], rbp - mov [r8 + 10*8], r10 - mov [r8 + 11*8], r11 - mov [r8 + 12*8], r12 - mov [r8 + 13*8], r13 - mov [r8 + 14*8], r14 - mov [r8 + 15*8], r15 - - ; Compute diagonals, and add double the result so far - - mov rcx, [rsp + SymCryptFdefRawSquareMulx_Frame.pSrcHome] - mov r9d, [rsp + 
SymCryptFdefRawSquareMulx_Frame.nDigitsHome] - mov r8, [rsp + SymCryptFdefRawSquareMulx_Frame.pDstHome] - - ; We can't keep the carries in Cy and Ov because there is no way to do a loop counter - ; without touching the Ov flag. - ; So we set the Ov carry in rsi, and retain a zero in rdi - xor esi, esi - xor edi, edi - -SymCryptFdefRawSquareMulxDiagonalsLoop: - ; Cy = carry in - ; esi = carry in (1 bit) - ; Ov = 0 - -SYMCRYPT_SQUARE_DIAG MACRO index - mov rdx, [rcx + 8 * index] - mov r10, [r8 + 16 * index] - mov r11, [r8 + 16 * index + 8] - mulx rbx, rax, rdx - adcx rax, r10 - adox rax, r10 - adcx rbx, r11 - adox rbx, r11 - mov [r8 + 16 * index], rax - mov [r8 + 16 * index + 8], rbx - ENDM - - ; First word is different to handle the carry - ; SYMCRYPT_SQUARE_DIAG 0 - mov rdx, [rcx] - mov r10, [r8] - mov r11, [r8 + 8] - mulx rbx, rax, rdx - adcx rax, rsi ; add both carries - adcx rbx, rdi ; rdi = 0; now Cy = 0 because result of multiply <= ff..fe00..01 - - adcx rax, r10 - adox rax, r10 - adcx rbx, r11 - adox rbx, r11 - mov [r8 ], rax - mov [r8 + 8], rbx - - SYMCRYPT_SQUARE_DIAG 1 - SYMCRYPT_SQUARE_DIAG 2 - SYMCRYPT_SQUARE_DIAG 3 - SYMCRYPT_SQUARE_DIAG 4 - SYMCRYPT_SQUARE_DIAG 5 - SYMCRYPT_SQUARE_DIAG 6 - SYMCRYPT_SQUARE_DIAG 7 - - ; Move the Ov flag into esi - mov esi, edi - adox esi, edi - - ; There is no way to do a loop counter without overwriting the Ov flag - ; Even the 'dec' instruction touches it, and LAHF/SAHF doesn't load/store the Ov flag. - ; We can't push/pop efl in a function body - - lea rcx, [rcx + 64] - lea r8, [r8 + 128] - dec r9d - jnz SymCryptFdefRawSquareMulxDiagonalsLoop - - - BEGIN_EPILOGUE - - pop rbp - pop rbx - pop rsi - pop rdi - pop r15 - pop r14 - pop r13 - pop r12 - ret - - NESTED_END SymCryptFdefRawSquareMulx, _TEXT - - - - - -;VOID -;SymCryptFdefMontgomeryReduce( -; _In_ PCSYMCRYPT_MODULUS pmMod, -; _In_ PUINT32 pSrc, -; _Out_ PUINT32 pDst ) - -SymCryptFdefMontgomeryReduceMulx_Frame struct - - SavedRbp dq ? - SavedRbx dq ? - SavedRsi dq ? - SavedRdi dq ? - SavedR15 dq ? - SavedR14 dq ? - SavedR13 dq ? - SavedR12 dq ? - returnaddress dq ? - - pmModHome dq ? - pSrcHome dq ? - pDstHome dq ? - - ; two 4-byte variables in P4Home - CntOuter dd ? ; outer loop counter - HighCarry dd ? 
- -SymCryptFdefMontgomeryReduceMulx_Frame ends - - - NESTED_ENTRY SymCryptFdefMontgomeryReduceMulx, _TEXT - - ; We need all the registers - push_reg r12 - push_reg r13 - push_reg r14 - push_reg r15 - push_reg rdi - push_reg rsi - push_reg rbx - push_reg rbp - - END_PROLOGUE - - ; rcx = pmMod - ; rdx = pSrc = scratch buffer - ; r8 = pDst - - mov [rsp + SymCryptFdefMontgomeryReduceMulx_Frame.pmModHome], rcx - mov [rsp + SymCryptFdefMontgomeryReduceMulx_Frame.pSrcHome], rdx - mov [rsp + SymCryptFdefMontgomeryReduceMulx_Frame.pDstHome], r8 - - mov r8, rdx - - mov eax, [rcx + SymCryptModulusNdigitsOffsetAmd64] - mov [rsp + SymCryptFdefMontgomeryReduceMulx_Frame.CntOuter], eax - ; CntOuter = nDigits - - xor ebx, ebx - mov [rsp + SymCryptFdefMontgomeryReduceMulx_Frame.HighCarry], ebx - ; HighCarry = 0 - -SymCryptFdefMontgomeryReduceMulxOuterLoop: - ; rcx = pmMod - ; r8 = pSrc = tmp buffer that we will reduce - mov rsi, [r8 + 0 * 8] - mov rbp, [r8 + 1 * 8] - mov r10, [r8 + 2 * 8] - mov r11, [r8 + 3 * 8] - mov r12, [r8 + 4 * 8] - mov r13, [r8 + 5 * 8] - mov r14, [r8 + 6 * 8] - mov r15, [r8 + 7 * 8] - - mov rdi, [rcx + SymCryptModulusMontgomeryInv64OffsetAmd64] ; inv64 - mov r9d, [rcx + SymCryptModulusNdigitsOffsetAmd64] - lea rcx, [rcx + SymCryptModulusValueOffsetAmd64] ; modulus value - - ; r8 = value to reduce - ; rsi - r15= r8[0..7] - ; rcx = modulus value - ; rdi = modinv - - MONTGOMERY18 rsi, rbp, r10, r11, r12, r13, r14, r15, rdi, rcx, r8 + 0 * 8, rax, rbx - MONTGOMERY18 rbp, r10, r11, r12, r13, r14, r15, rsi, rdi, rcx, r8 + 1 * 8, rax, rbx - MONTGOMERY18 r10, r11, r12, r13, r14, r15, rsi, rbp, rdi, rcx, r8 + 2 * 8, rax, rbx - MONTGOMERY18 r11, r12, r13, r14, r15, rsi, rbp, r10, rdi, rcx, r8 + 3 * 8, rax, rbx - MONTGOMERY18 r12, r13, r14, r15, rsi, rbp, r10, r11, rdi, rcx, r8 + 4 * 8, rax, rbx - MONTGOMERY18 r13, r14, r15, rsi, rbp, r10, r11, r12, rdi, rcx, r8 + 5 * 8, rax, rbx - MONTGOMERY18 r14, r15, rsi, rbp, r10, r11, r12, r13, rdi, rcx, r8 + 6 * 8, rax, rbx - MONTGOMERY18 r15, rsi, rbp, r10, r11, r12, r13, r14, rdi, rcx, r8 + 7 * 8, rax, rbx - - ; rsi .. r15 = carry from multiply-add - ; r8[0..7] = Montgomery factors - - mov rdi, r8 ; factor to multiply by - add rcx, 64 - add r8, 64 - - sub r9d, 1 - jz SymCryptFdefMontgomeryReduceMulxInnerLoopDone - -SymCryptFdefMontgomeryReduceMulxInnerLoop: - - ; rsi, rbp, r10, r11, r12, r13, r14, r15 8-word carry - ; rax, rbx temps for multiplication - ; rcx running pointer pMod inner loop - ; r8 running pointer pSrc inner loop - ; rdi Montgomery factors for this row - ; r9 loop ctr - ; rdx fixed input reg for multiplication - - MULADD88 rsi, rbp, r10, r11, r12, r13, r14, r15, r8, rcx, rdi, rax, rbx - ; pre & post: Cy = Ov = 0 - ; R[7-0]:D[7-0] = A[7:0] * B[7:0] + R[7:0] + D[7:0] - ; rdx is volatile - - add rcx, 64 - add r8, 64 - sub r9d, 1 - jnz SymCryptFdefMontgomeryReduceMulxInnerLoop - - -SymCryptFdefMontgomeryReduceMulxInnerLoopDone: - - ; We have an 8-word carry here, which we need to add to the in-memory buffer and retain a carry - ; We also saved a 1-bit carry from the previous outer loop - xor edx, edx - mov eax, [rsp + SymCryptFdefMontgomeryReduceMulx_Frame.HighCarry] - ; move carry into Cy flag - neg eax - - ; We do this in separate instructions to help the instruction decoder build up a lead... 
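In C terms, the adc ladder that follows computes the following (hedged model; fold_carry8 is an illustrative name, not a SymCrypt function):

    #include <stdint.h>

    /* buf[0..7] += reg[0..7] + carryIn, where carryIn is the saved 1-bit
       HighCarry; returns the new 1-bit carry to save for the next row. */
    static unsigned fold_carry8( uint64_t buf[8], const uint64_t reg[8], unsigned carryIn )
    {
        unsigned __int128 c = carryIn;

        for( int i = 0; i < 8; i++ )
        {
            c += (unsigned __int128) buf[i] + reg[i];
            buf[i] = (uint64_t) c;
            c >>= 64;
        }
        return (unsigned) c;    /* 0 or 1 */
    }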
- mov rax, [r8 + 0 * 8] - adc rax, rsi - mov [r8 + 0 * 8], rax - - mov rbx, [r8 + 1 * 8] - adc rbx, rbp - mov [r8 + 1 * 8], rbx - - mov rax, [r8 + 2 * 8] - adc rax, r10 - mov [r8 + 2 * 8], rax - - mov rbx, [r8 + 3 * 8] - adc rbx, r11 - mov [r8 + 3 * 8], rbx - - mov rax, [r8 + 4 * 8] - adc rax, r12 - mov [r8 + 4 * 8], rax - - mov rbx, [r8 + 5 * 8] - adc rbx, r13 - mov [r8 + 5 * 8], rbx - - mov rax, [r8 + 6 * 8] - adc rax, r14 - mov [r8 + 6 * 8], rax - - mov rbx, [r8 + 7 * 8] - adc rbx, r15 - mov [r8 + 7 * 8], rbx - - adc edx, edx ; edx = carry - mov [rsp + SymCryptFdefMontgomeryReduceMulx_Frame.HighCarry], edx - - mov r8, [rsp + SymCryptFdefMontgomeryReduceMulx_Frame.pSrcHome] - add r8, 64 - mov [rsp + SymCryptFdefMontgomeryReduceMulx_Frame.pSrcHome], r8 - - mov rcx, [rsp + SymCryptFdefMontgomeryReduceMulx_Frame.pmModHome] - - mov eax, [rsp + SymCryptFdefMontgomeryReduceMulx_Frame.CntOuter] - sub eax, 1 - mov [rsp + SymCryptFdefMontgomeryReduceMulx_Frame.CntOuter], eax - - jnz SymCryptFdefMontgomeryReduceMulxOuterloop - - ; edx = output carry - - mov esi, [rcx + SymCryptModulusNdigitsOffsetAmd64] - lea rcx, [rcx + SymCryptModulusValueOffsetAmd64] ; modulus value - - mov rdi, [rsp + SymCryptFdefMontgomeryReduceMulx_Frame.pDstHome] - - ; r8 = result buffer pointer - ; esi = # digits - ; rcx = modulus value - ; rdi = Dst - - ; copy these values for the maked copy loop - mov r9d, esi ; nDigits - mov r10, r8 ; result buffer - mov rbp, rdi ; destination pointer - - ; pDst = Reduction result - Modulus - -SymCryptFdefMontgomeryReduceMulxSubLoop: - mov rax,[r8 + 0 * 8] - sbb rax,[rcx + 0 * 8] - mov [rdi + 0 * 8], rax - - mov rbx,[r8 + 1 * 8] - sbb rbx,[rcx + 1 * 8] - mov [rdi + 1 * 8], rbx - - mov rax,[r8 + 2 * 8] - sbb rax,[rcx + 2 * 8] - mov [rdi + 2 * 8], rax - - mov rbx,[r8 + 3 * 8] - sbb rbx,[rcx + 3 * 8] - mov [rdi + 3 * 8], rbx - - mov rax,[r8 + 4 * 8] - sbb rax,[rcx + 4 * 8] - mov [rdi + 4 * 8], rax - - mov rbx,[r8 + 5 * 8] - sbb rbx,[rcx + 5 * 8] - mov [rdi + 5 * 8], rbx - - mov rax,[r8 + 6 * 8] - sbb rax,[rcx + 6 * 8] - mov [rdi + 6 * 8], rax - - mov rbx,[r8 + 7 * 8] - sbb rbx,[rcx + 7 * 8] - mov [rdi + 7 * 8], rbx - - lea r8, [r8 + 64] - lea rcx, [rcx + 64] - lea rdi, [rdi + 64] - dec esi - jnz SymCryptFdefMontgomeryReduceMulxSubLoop - - ; now a masked copy from the reduction buffer to the destination. 
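The SSE sequence below is a constant-time select: the same loads and stores execute whether or not the copy happens, so the final reduction step leaks nothing through the memory access pattern. A hedged C model (illustrative names only):

    #include <stddef.h>
    #include <stdint.h>

    /* Copy src over dst when mask is all-ones; keep dst when mask is zero. */
    static void masked_copy( uint64_t *dst, const uint64_t *src, size_t nWords, uint64_t mask )
    {
        for( size_t i = 0; i < nWords; i++ )
        {
            dst[i] = (src[i] & mask) | (dst[i] & ~mask);
        }
    }

    /* The 256-bit paths earlier in the patch use the equivalent xor/and/xor
       form: selects a when mask is all-ones, else b, without a branch. */
    static uint64_t ct_select( uint64_t mask, uint64_t a, uint64_t b )
    {
        return b ^ (mask & (a ^ b));
    }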
- ; copy if high carry = 0 and Cy = 1 - sbb edx, 0 - ; edx = copy mask, ff...ff if copy, 0 of no copy - - movd xmm0, edx ; xmm0[0] = mask - pcmpeqd xmm1, xmm1 ; xmm1 = ff...ff - pshufd xmm0, xmm0, 0 ; xmm0[0..3] = mask - pxor xmm1, xmm0 ; xmm1 = not Mask - -SymCryptFdefMontgomeryReduceMulxMaskedCopyLoop: - movdqa xmm2, [r10 + 0 * 16] ; xmm2 = pSrc[0] - movdqa xmm3, [rbp + 0 * 16] ; xmm3 = pDst[0] - pand xmm2, xmm0 - pand xmm3, xmm1 - por xmm2, xmm3 - movdqa [rbp + 0 * 16], xmm2 - - movdqa xmm2, [r10 + 1 * 16] ; xmm2 = pSrc[0] - movdqa xmm3, [rbp + 1 * 16] ; xmm3 = pDst[0] - pand xmm2, xmm0 - pand xmm3, xmm1 - por xmm2, xmm3 - movdqa [rbp + 1 * 16], xmm2 - - movdqa xmm2, [r10 + 2 * 16] ; xmm2 = pSrc[0] - movdqa xmm3, [rbp + 2 * 16] ; xmm3 = pDst[0] - pand xmm2, xmm0 - pand xmm3, xmm1 - por xmm2, xmm3 - movdqa [rbp + 2 * 16], xmm2 - - movdqa xmm2, [r10 + 3 * 16] ; xmm2 = pSrc[0] - movdqa xmm3, [rbp + 3 * 16] ; xmm3 = pDst[0] - pand xmm2, xmm0 - pand xmm3, xmm1 - por xmm2, xmm3 - movdqa [rbp + 3 * 16], xmm2 - - ; Move on to the next digit - - add r10, 64 - add rbp, 64 - sub r9d, 1 - jnz SymCryptFdefMontgomeryReduceMulxMaskedCopyLoop - - BEGIN_EPILOGUE - - pop rbp - pop rbx - pop rsi - pop rdi - pop r15 - pop r14 - pop r13 - pop r12 - ret - - NESTED_END SymCryptFdefMontgomeryReduceMulx, _TEXT - -; -------------------------------- -; 1024-bit size specific functions -; -------------------------------- - -;VOID -;SYMCRYPT_CALL -;SymCryptFdefRawMul( -; _In_reads_(nWords1) PCUINT32 pSrc1, -; _In_reads_(nWords2) PCUINT32 pSrc2, -; UINT32 nDigits, -; _Out_writes_(nWords1 + nWords2) PUINT32 pDst ) - - NESTED_ENTRY SymCryptFdefRawMulMulx1024, _TEXT - - ; We need all the registers - push_reg r12 - push_reg r13 - push_reg r14 - push_reg r15 - push_reg rdi - push_reg rsi - push_reg rbx - push_reg rbp - - END_PROLOGUE - - ; First we wipe nDigits2 of the result (size of in) - mov rbx, r9 - mov rdi, r9 - - mov r9, r8 - mov r8, rdx - - ; rcx = pSrc1 - ; r8 = pSrc2 - ; r9 = nDigits - - ; Wipe destination for nDigit2 blocks - xorps xmm0,xmm0 ; Zero register for 16-byte wipes - - movaps [rbx],xmm0 - movaps [rbx+16],xmm0 ; Wipe 32 bytes - movaps [rbx+32],xmm0 ; Wipe 32 bytes - movaps [rbx+48],xmm0 ; Wipe 32 bytes - - movaps [rbx+64],xmm0 - movaps [rbx+80],xmm0 ; Wipe 32 bytes - movaps [rbx+96],xmm0 ; Wipe 32 bytes - movaps [rbx+112],xmm0 ; Wipe 32 bytes - - ; Digit 1 from src2 - - ZEROREG_8 rsi, rbp, r10, r11, r12, r13, r14, r15 ; Leaves Cy = Ov = 0 - - MULADD88 rsi, rbp, r10, r11, r12, r13, r14, r15, rdi, rcx, r8, rax, rbx - - add r8, 64 ; Src2 ptr - add rdi, 64 - xor rax, rax ; sets Cy = Ov = 0 - - MULADD88 rsi, rbp, r10, r11, r12, r13, r14, r15, rdi, rcx, r8, rax, rbx - - add rdi, 64 - - ; Write the 8-word carry-out to the destination - mov [rdi + 0*8], rsi - mov [rdi + 1*8], rbp - mov [rdi + 2*8], r10 - mov [rdi + 3*8], r11 - mov [rdi + 4*8], r12 - mov [rdi + 5*8], r13 - mov [rdi + 6*8], r14 - mov [rdi + 7*8], r15 - - ; Digit 2 from src2 - - ; set up - - ; Mov rdi one digit back - sub rdi, 64 - - ; reload pSrc2 - sub r8, 64 - - ; update PSrc1 - add rcx, 64 - - ZEROREG_8 rsi, rbp, r10, r11, r12, r13, r14, r15 ; Leaves Cy = Ov = 0 - - MULADD88 rsi, rbp, r10, r11, r12, r13, r14, r15, rdi, rcx, r8, rax, rbx - - add r8, 64 ; Src2 ptr - add rdi, 64 - xor rax, rax ; sets Cy = Ov = 0 - - MULADD88 rsi, rbp, r10, r11, r12, r13, r14, r15, rdi, rcx, r8, rax, rbx - - add rdi, 64 - - ; Write the 8-word carry-out to the destination - mov [rdi + 0*8], rsi - mov [rdi + 1*8], rbp - mov [rdi + 2*8], r10 - mov [rdi + 3*8], r11 
- mov [rdi + 4*8], r12 - mov [rdi + 5*8], r13 - mov [rdi + 6*8], r14 - mov [rdi + 7*8], r15 - - BEGIN_EPILOGUE - - pop rbp - pop rbx - pop rsi - pop rdi - pop r15 - pop r14 - pop r13 - pop r12 - ret - - NESTED_END SymCryptFdefRawMulMulx1024, _TEXT - -; VOID -; SYMCRYPT_CALL -; SymCryptFdefRawSquare( -; _In_reads_(nDgigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc, -; UINT32 nDigits, -; _Out_writes_(2*nWords) PUINT32 pDst ) - - NESTED_ENTRY SymCryptFdefRawSquareMulx1024, _TEXT - - ; We need all the registers - push_reg r12 - push_reg r13 - push_reg r14 - push_reg r15 - push_reg rdi - push_reg rsi - push_reg rbx - push_reg rbp - - END_PROLOGUE - - ; rcx = pSrc - ; rdx = nDigits // (ignored) - ; r8 = pDst - - ; Save parameters for phase 2 - mov r9, r8 ; pDst - - ; Wipe destination for nDigits blocks - - xor rax, rax - mov rbx, r8 - ; we'll use the edx digit counter destructively... - - ; Wipe destination - xorps xmm0,xmm0 ; Zero register for 16-byte wipes - - movaps [rbx],xmm0 - movaps [rbx+16],xmm0 ; Wipe 32 bytes - movaps [rbx+32],xmm0 ; Wipe 32 bytes - movaps [rbx+48],xmm0 ; Wipe 32 bytes - - movaps [rbx+64],xmm0 - movaps [rbx+80],xmm0 ; Wipe 32 bytes - movaps [rbx+96],xmm0 ; Wipe 32 bytes - movaps [rbx+112],xmm0 ; Wipe 32 bytes - - ; Cy = Ov = 0 here - - HALF_SQUARE_NODIAG8 rsi, rbp, r10, r11, r12, r13, r14, r15, r8, rcx, rax, rbx - - lea rdi, [rcx + 64] - lea r8, [r8 + 64] - - ; rsi, rbp, r10, r11, r12, r13, r14, r15 8-word carry - ; rax, rbx temps for multiplication - ; rcx pSrc running pointer outer loop - ; r8 pDst running pointer inner loop - ; rdx fixed input reg for multiplication - ; rdi pSrc running pointer inner loop - - ; rsp[pSrc] pSrc (used for final pass) - ; rsp[nDigits] nDigits (used for final pass) - ; rsp[pDst] pDst (used for final pass) - ; rsp[nextNDigits] # inner loop blocks in next outer loop iteration - ; rsp[pDstPtr] pDst running pointer outer loop - - MULADD88 rsi, rbp, r10, r11, r12, r13, r14, r15, r8, rcx, rdi, rax, rbx - - add r8, 64 - add rdi, 64 - - ; Write the 8-word carry-out to the destination - mov [r8 + 0*8], rsi - mov [r8 + 1*8], rbp - mov [r8 + 2*8], r10 - mov [r8 + 3*8], r11 - mov [r8 + 4*8], r12 - mov [r8 + 5*8], r13 - mov [r8 + 6*8], r14 - mov [r8 + 7*8], r15 - - add rcx, 64 - - ; r8 which is the destination pointer is shifted here by 2 digits - - xor rax, rax ; Sets Cy = Ov = 0 - - HALF_SQUARE_NODIAG8 rsi, rbp, r10, r11, r12, r13, r14, r15, r8, rcx, rax, rbx - - ; Cy = Ov = 0 because last 'sub r9d, 1' resulted in 0 - - ; Write the 8-word carry-out to the destination - mov [r8 + 8*8], rsi - mov [r8 + 9*8], rbp - mov [r8 + 10*8], r10 - mov [r8 + 11*8], r11 - mov [r8 + 12*8], r12 - mov [r8 + 13*8], r13 - mov [r8 + 14*8], r14 - mov [r8 + 15*8], r15 - - ; Compute diagonals, and add double the result so far - - sub rdi, 128 ; Revert rdi back to pSrcHome - mov rcx, rdi - mov r8, r9 - - xor rax, rax ; Sets Cy = Ov = 0 - - SYMCRYPT_SQUARE_DIAG 0 - SYMCRYPT_SQUARE_DIAG 1 - SYMCRYPT_SQUARE_DIAG 2 - SYMCRYPT_SQUARE_DIAG 3 - SYMCRYPT_SQUARE_DIAG 4 - SYMCRYPT_SQUARE_DIAG 5 - SYMCRYPT_SQUARE_DIAG 6 - SYMCRYPT_SQUARE_DIAG 7 - - SYMCRYPT_SQUARE_DIAG 8 - SYMCRYPT_SQUARE_DIAG 9 - SYMCRYPT_SQUARE_DIAG 10 - SYMCRYPT_SQUARE_DIAG 11 - SYMCRYPT_SQUARE_DIAG 12 - SYMCRYPT_SQUARE_DIAG 13 - SYMCRYPT_SQUARE_DIAG 14 - SYMCRYPT_SQUARE_DIAG 15 - - BEGIN_EPILOGUE - - pop rbp - pop rbx - pop rsi - pop rdi - pop r15 - pop r14 - pop r13 - pop r12 - ret - - NESTED_END SymCryptFdefRawSquareMulx1024, _TEXT - -;VOID -;SymCryptFdefMontgomeryReduce( -; _In_ PCSYMCRYPT_MODULUS pmMod, -; 
_In_ PUINT32 pSrc, -; _Out_ PUINT32 pDst ) - - NESTED_ENTRY SymCryptFdefMontgomeryReduceMulx1024, _TEXT - - ; We need all the registers - push_reg r12 - push_reg r13 - push_reg r14 - push_reg r15 - push_reg rdi - push_reg rsi - push_reg rbx - push_reg rbp - - END_PROLOGUE - - ; rcx = pmMod - ; rdx = pSrc = scratch buffer - ; r8 = pDst - - mov [rsp + SymCryptFdefMontgomeryReduceMulx_Frame.pmModHome], rcx - mov [rsp + SymCryptFdefMontgomeryReduceMulx_Frame.pSrcHome], rdx - mov [rsp + SymCryptFdefMontgomeryReduceMulx_Frame.pDstHome], r8 - - mov r8, rdx - - mov eax, [rcx + SymCryptModulusNdigitsOffsetAmd64] - mov [rsp + SymCryptFdefMontgomeryReduceMulx_Frame.CntOuter], eax - ; CntOuter = nDigits - - xor ebx, ebx - mov [rsp + SymCryptFdefMontgomeryReduceMulx_Frame.HighCarry], ebx - ; HighCarry = 0 - -SymCryptFdefMontgomeryReduceMulx1024OuterLoop: - ; rcx = pmMod - ; r8 = pSrc = tmp buffer that we will reduce - mov rsi, [r8 + 0 * 8] - mov rbp, [r8 + 1 * 8] - mov r10, [r8 + 2 * 8] - mov r11, [r8 + 3 * 8] - mov r12, [r8 + 4 * 8] - mov r13, [r8 + 5 * 8] - mov r14, [r8 + 6 * 8] - mov r15, [r8 + 7 * 8] - - mov rdi, [rcx + SymCryptModulusMontgomeryInv64OffsetAmd64] ; inv64 - mov r9d, [rcx + SymCryptModulusNdigitsOffsetAmd64] - lea rcx, [rcx + SymCryptModulusValueOffsetAmd64] ; modulus value - - ; r8 = value to reduce - ; rsi - r15= r8[0..7] - ; rcx = modulus value - ; rdi = modinv - - MONTGOMERY18 rsi, rbp, r10, r11, r12, r13, r14, r15, rdi, rcx, r8 + 0 * 8, rax, rbx - MONTGOMERY18 rbp, r10, r11, r12, r13, r14, r15, rsi, rdi, rcx, r8 + 1 * 8, rax, rbx - MONTGOMERY18 r10, r11, r12, r13, r14, r15, rsi, rbp, rdi, rcx, r8 + 2 * 8, rax, rbx - MONTGOMERY18 r11, r12, r13, r14, r15, rsi, rbp, r10, rdi, rcx, r8 + 3 * 8, rax, rbx - MONTGOMERY18 r12, r13, r14, r15, rsi, rbp, r10, r11, rdi, rcx, r8 + 4 * 8, rax, rbx - MONTGOMERY18 r13, r14, r15, rsi, rbp, r10, r11, r12, rdi, rcx, r8 + 5 * 8, rax, rbx - MONTGOMERY18 r14, r15, rsi, rbp, r10, r11, r12, r13, rdi, rcx, r8 + 6 * 8, rax, rbx - MONTGOMERY18 r15, rsi, rbp, r10, r11, r12, r13, r14, rdi, rcx, r8 + 7 * 8, rax, rbx - - ; rsi .. r15 = carry from multiply-add - ; r8[0..7] = Montgomery factors - - mov rdi, r8 ; factor to multiply by - add rcx, 64 - add r8, 64 - - ; rsi, rbp, r10, r11, r12, r13, r14, r15 8-word carry - ; rax, rbx temps for multiplication - ; rcx running pointer pMod inner loop - ; r8 running pointer pSrc inner loop - ; rdi Montgomery factors for this row - ; r9 loop ctr - ; rdx fixed input reg for multiplication - - MULADD88 rsi, rbp, r10, r11, r12, r13, r14, r15, r8, rcx, rdi, rax, rbx - ; pre & post: Cy = Ov = 0 - ; R[7-0]:D[7-0] = A[7:0] * B[7:0] + R[7:0] + D[7:0] - ; rdx is volatile - - add rcx, 64 - add r8, 64 - - ; We have an 8-word carry here, which we need to add to the in-memory buffer and retain a carry - ; We also saved a 1-bit carry from the previous outer loop - xor edx, edx - mov eax, [rsp + SymCryptFdefMontgomeryReduceMulx_Frame.HighCarry] - ; move carry into Cy flag - neg eax - - ; We do this in separate instructions to help the instruction decoder build up a lead... 
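In C terms, the add-back step this comment introduces is roughly the sketch below (an illustrative model with hypothetical names, not SymCrypt source; it assumes a compiler with `unsigned __int128` support, e.g. gcc or clang). The same chain survives, register-renamed, in the symcryptasm rewrite later in this patch.

```c
#include <stdint.h>

typedef unsigned __int128 uint128_t;

// Model of the adc chain: fold the 8-word carry held in registers into the
// next 8 words of the buffer, starting from the saved 1-bit HighCarry, and
// return the new 1-bit carry for the next outer iteration.
static uint32_t add_back_carry( uint64_t buf[8], const uint64_t r[8], uint32_t highCarry )
{
    uint128_t c = highCarry;
    for( int i = 0; i < 8; i++ )
    {
        c += (uint128_t) buf[i] + r[i];
        buf[i] = (uint64_t) c;
        c >>= 64;
    }
    return (uint32_t) c;    // 0 or 1
}
```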
- mov rax, [r8 + 0 * 8] - adc rax, rsi - mov [r8 + 0 * 8], rax - - mov rbx, [r8 + 1 * 8] - adc rbx, rbp - mov [r8 + 1 * 8], rbx - - mov rax, [r8 + 2 * 8] - adc rax, r10 - mov [r8 + 2 * 8], rax - - mov rbx, [r8 + 3 * 8] - adc rbx, r11 - mov [r8 + 3 * 8], rbx - - mov rax, [r8 + 4 * 8] - adc rax, r12 - mov [r8 + 4 * 8], rax - - mov rbx, [r8 + 5 * 8] - adc rbx, r13 - mov [r8 + 5 * 8], rbx - - mov rax, [r8 + 6 * 8] - adc rax, r14 - mov [r8 + 6 * 8], rax - - mov rbx, [r8 + 7 * 8] - adc rbx, r15 - mov [r8 + 7 * 8], rbx - - adc edx, edx ; edx = carry - mov [rsp + SymCryptFdefMontgomeryReduceMulx_Frame.HighCarry], edx - - mov r8, [rsp + SymCryptFdefMontgomeryReduceMulx_Frame.pSrcHome] - add r8, 64 - mov [rsp + SymCryptFdefMontgomeryReduceMulx_Frame.pSrcHome], r8 - - mov rcx, [rsp + SymCryptFdefMontgomeryReduceMulx_Frame.pmModHome] - - mov eax, [rsp + SymCryptFdefMontgomeryReduceMulx_Frame.CntOuter] - sub eax, 1 - mov [rsp + SymCryptFdefMontgomeryReduceMulx_Frame.CntOuter], eax - - jnz SymCryptFdefMontgomeryReduceMulx1024Outerloop - - ; edx = output carry - - mov esi, [rcx + SymCryptModulusNdigitsOffsetAmd64] - lea rcx, [rcx + SymCryptModulusValueOffsetAmd64] ; modulus value - - mov rdi, [rsp + SymCryptFdefMontgomeryReduceMulx_Frame.pDstHome] - - ; r8 = result buffer pointer - ; esi = # digits - ; rcx = modulus value - ; rdi = Dst - - ; copy these values for the maked copy loop - mov r9d, esi ; nDigits - mov r10, r8 ; result buffer - mov rbp, rdi ; destination pointer - - ; pDst = Reduction result - Modulus - - mov rax,[r8 + 0 * 8] - sbb rax,[rcx + 0 * 8] - mov [rdi + 0 * 8], rax - - mov rbx,[r8 + 1 * 8] - sbb rbx,[rcx + 1 * 8] - mov [rdi + 1 * 8], rbx - - mov rax,[r8 + 2 * 8] - sbb rax,[rcx + 2 * 8] - mov [rdi + 2 * 8], rax - - mov rbx,[r8 + 3 * 8] - sbb rbx,[rcx + 3 * 8] - mov [rdi + 3 * 8], rbx - - mov rax,[r8 + 4 * 8] - sbb rax,[rcx + 4 * 8] - mov [rdi + 4 * 8], rax - - mov rbx,[r8 + 5 * 8] - sbb rbx,[rcx + 5 * 8] - mov [rdi + 5 * 8], rbx - - mov rax,[r8 + 6 * 8] - sbb rax,[rcx + 6 * 8] - mov [rdi + 6 * 8], rax - - mov rbx,[r8 + 7 * 8] - sbb rbx,[rcx + 7 * 8] - mov [rdi + 7 * 8], rbx - - mov rax,[r8 + 8 * 8] - sbb rax,[rcx + 8 * 8] - mov [rdi + 8 * 8], rax - - mov rbx,[r8 + 9 * 8] - sbb rbx,[rcx + 9 * 8] - mov [rdi + 9 * 8], rbx - - mov rax,[r8 + 10 * 8] - sbb rax,[rcx + 10 * 8] - mov [rdi + 10 * 8], rax - - mov rbx,[r8 + 11 * 8] - sbb rbx,[rcx + 11 * 8] - mov [rdi + 11 * 8], rbx - - mov rax,[r8 + 12 * 8] - sbb rax,[rcx + 12 * 8] - mov [rdi + 12 * 8], rax - - mov rbx,[r8 + 13 * 8] - sbb rbx,[rcx + 13 * 8] - mov [rdi + 13 * 8], rbx - - mov rax,[r8 + 14 * 8] - sbb rax,[rcx + 14 * 8] - mov [rdi + 14 * 8], rax - - mov rbx,[r8 + 15 * 8] - sbb rbx,[rcx + 15 * 8] - mov [rdi + 15 * 8], rbx - - - ; now a masked copy from the reduction buffer to the destination. 
- ; copy if high carry = 0 and Cy = 1 - sbb edx, 0 - ; edx = copy mask, ff...ff if copy, 0 of no copy - - movd xmm0, edx ; xmm0[0] = mask - pcmpeqd xmm1, xmm1 ; xmm1 = ff...ff - pshufd xmm0, xmm0, 0 ; xmm0[0..3] = mask - pxor xmm1, xmm0 ; xmm1 = not Mask - - - movdqa xmm2, [r10 + 0 * 16] ; xmm2 = pSrc[0] - movdqa xmm3, [rbp + 0 * 16] ; xmm3 = pDst[0] - pand xmm2, xmm0 - pand xmm3, xmm1 - por xmm2, xmm3 - movdqa [rbp + 0 * 16], xmm2 - - movdqa xmm2, [r10 + 1 * 16] ; xmm2 = pSrc[0] - movdqa xmm3, [rbp + 1 * 16] ; xmm3 = pDst[0] - pand xmm2, xmm0 - pand xmm3, xmm1 - por xmm2, xmm3 - movdqa [rbp + 1 * 16], xmm2 - - movdqa xmm2, [r10 + 2 * 16] ; xmm2 = pSrc[0] - movdqa xmm3, [rbp + 2 * 16] ; xmm3 = pDst[0] - pand xmm2, xmm0 - pand xmm3, xmm1 - por xmm2, xmm3 - movdqa [rbp + 2 * 16], xmm2 - - movdqa xmm2, [r10 + 3 * 16] ; xmm2 = pSrc[0] - movdqa xmm3, [rbp + 3 * 16] ; xmm3 = pDst[0] - pand xmm2, xmm0 - pand xmm3, xmm1 - por xmm2, xmm3 - movdqa [rbp + 3 * 16], xmm2 - - movdqa xmm2, [r10 + 4 * 16] ; xmm2 = pSrc[0] - movdqa xmm3, [rbp + 4 * 16] ; xmm3 = pDst[0] - pand xmm2, xmm0 - pand xmm3, xmm1 - por xmm2, xmm3 - movdqa [rbp + 4 * 16], xmm2 - - movdqa xmm2, [r10 + 5 * 16] ; xmm2 = pSrc[0] - movdqa xmm3, [rbp + 5 * 16] ; xmm3 = pDst[0] - pand xmm2, xmm0 - pand xmm3, xmm1 - por xmm2, xmm3 - movdqa [rbp + 5 * 16], xmm2 - - movdqa xmm2, [r10 + 6 * 16] ; xmm2 = pSrc[0] - movdqa xmm3, [rbp + 6 * 16] ; xmm3 = pDst[0] - pand xmm2, xmm0 - pand xmm3, xmm1 - por xmm2, xmm3 - movdqa [rbp + 6 * 16], xmm2 - - movdqa xmm2, [r10 + 7 * 16] ; xmm2 = pSrc[0] - movdqa xmm3, [rbp + 7 * 16] ; xmm3 = pDst[0] - pand xmm2, xmm0 - pand xmm3, xmm1 - por xmm2, xmm3 - movdqa [rbp + 7 * 16], xmm2 - - - BEGIN_EPILOGUE - - pop rbp - pop rbx - pop rsi - pop rdi - pop r15 - pop r14 - pop r13 - pop r12 - ret - - NESTED_END SymCryptFdefMontgomeryReduceMulx1024, _TEXT - - -;============================================================================= -; test code - -MULX_TEST_1 MACRO - mulx rax, rcx, [r8 + 8] - adcx r10, rcx - adox r11, rax - ENDM - -MULX_TEST_4 MACRO - MULX_TEST_1 - MULX_TEST_1 - MULX_TEST_1 - MULX_TEST_1 - ENDM - -MULX_TEST_16 MACRO - MULX_TEST_4 - MULX_TEST_4 - MULX_TEST_4 - MULX_TEST_4 - ENDM - -MULX_TEST_64 MACRO - MULX_TEST_16 - MULX_TEST_16 - MULX_TEST_16 - MULX_TEST_16 - ENDM - -MULX_TEST_256 MACRO - MULX_TEST_64 - MULX_TEST_64 - MULX_TEST_64 - MULX_TEST_64 - ENDM - -MULX_TEST_1024 MACRO - MULX_TEST_256 - MULX_TEST_256 - MULX_TEST_256 - MULX_TEST_256 - ENDM - - LEAF_ENTRY SymCryptTestMulx, _TEXT - - mov r8, rsp - - MULX_TEST_1024 - - ret - LEAF_END SymCryptTestMulx, _TEXT - - - - end diff --git a/lib/amd64/fdef_mulx.symcryptasm b/lib/amd64/fdef_mulx.symcryptasm new file mode 100644 index 0000000..c072fc5 --- /dev/null +++ b/lib/amd64/fdef_mulx.symcryptasm @@ -0,0 +1,1265 @@ +// +// fdef_mulx.symcryptasm Assembler code for large integer arithmetic in the default data format +// using the bmi2 instructions mulx, adcx and adox +// Expresses asm in a generic enough way to enable generation of MASM and GAS using the +// symcryptasm_processor.py script and C preprocessor +// +// Copyright (c) Microsoft Corporation. Licensed under the MIT license. 
+// + +#include "symcryptasm_shared.cppasm" + +MACRO_START(ZEROREG, R) + xor R,R +MACRO_END() + +MACRO_START(ZEROREG_8, R0, R1, R2, R3, R4, R5, R6, R7) + ZEROREG R0 + ZEROREG R1 + ZEROREG R2 + ZEROREG R3 + ZEROREG R4 + ZEROREG R5 + ZEROREG R6 + ZEROREG R7 +MACRO_END() + +MACRO_START(MULADD18, R0, R1, R2, R3, R4, R5, R6, R7, pD, pA, pB, T0, T1, QH) + // R0:R[7:1]:D[0] = A[7:0] * B[0] + D[0] + R[7:0] + // Pre: Cy = Ov = 0 + // Post: Cy = Ov = 0 + + mov QH, [pB] + adox R0, [pD] + + mulx T1, T0, [pA + 0 * 8] + adcx R0, T0 + adox R1, T1 + + mulx T1, T0, [pA + 1 * 8] + adcx R1, T0 + adox R2, T1 + + mulx T1, T0, [pA + 2 * 8] + adcx R2, T0 + adox R3, T1 + + mulx T1, T0, [pA + 3 * 8] + adcx R3, T0 + adox R4, T1 + + mulx T1, T0, [pA + 4 * 8] + adcx R4, T0 + adox R5, T1 + + mulx T1, T0, [pA + 5 * 8] + adcx R5, T0 + adox R6, T1 + + mulx T1, T0, [pA + 6 * 8] + adcx R6, T0 + adox R7, T1 + + mulx T1, T0, [pA + 7 * 8] + adcx R7, T0 + mov [pD], R0 + + mov R0, 0 + adcx R0, R0 + adox R0, T1 +MACRO_END() + +MACRO_START(MULADD88, R0, R1, R2, R3, R4, R5, R6, R7, pD, pA, pB, T0, T1, QH) + // pre & post: Cy = Ov = 0 + // R[7-0]:D[7-0] = A[7:0] * B[7:0] + R[7:0] + D[7:0] + + MULADD18 R0, R1, R2, R3, R4, R5, R6, R7, pD , pA, pB , T0, T1, QH + MULADD18 R1, R2, R3, R4, R5, R6, R7, R0, pD + 8, pA, pB + 8, T0, T1, QH + MULADD18 R2, R3, R4, R5, R6, R7, R0, R1, pD + 16, pA, pB + 16, T0, T1, QH + MULADD18 R3, R4, R5, R6, R7, R0, R1, R2, pD + 24, pA, pB + 24, T0, T1, QH + MULADD18 R4, R5, R6, R7, R0, R1, R2, R3, pD + 32, pA, pB + 32, T0, T1, QH + MULADD18 R5, R6, R7, R0, R1, R2, R3, R4, pD + 40, pA, pB + 40, T0, T1, QH + MULADD18 R6, R7, R0, R1, R2, R3, R4, R5, pD + 48, pA, pB + 48, T0, T1, QH + MULADD18 R7, R0, R1, R2, R3, R4, R5, R6, pD + 56, pA, pB + 56, T0, T1, QH +MACRO_END() + + +MACRO_START(HALF_SQUARE_NODIAG8, R0, R1, R2, R3, R4, R5, R6, R7, pD, pA, T0, T1, QH) + // pre & post: Cy = Ov = 0 + // R[7-0]:D[7-0] = D[7:0] + (A[0:7]^2 - \sum_{i=0}^7 (A[i] * 2^{64*i}) )/2 + // This is the component of the square that needs to be doubled, and then the diagonals added + + // Note that Dst[0] is not changed by this macro + + mov QH, [pA + 0 * 8] // QH = A0 + mov R1, [pD + 1 * 8] + mov R2, [pD + 2 * 8] + mov R3, [pD + 3 * 8] + mov R4, [pD + 4 * 8] + mov R5, [pD + 5 * 8] + mov R6, [pD + 6 * 8] + mov R7, [pD + 7 * 8] + xor R0, R0 + + mulx T1, T0, [pA + 1 * 8] + adcx R1, T0 + adox R2, T1 + + mulx T1, T0, [pA + 2 * 8] + adcx R2, T0 + adox R3, T1 + + mulx T1, T0, [pA + 3 * 8] + adcx R3, T0 + adox R4, T1 + + mulx T1, T0, [pA + 4 * 8] + adcx R4, T0 + adox R5, T1 + + mulx T1, T0, [pA + 5 * 8] + adcx R5, T0 + adox R6, T1 + + mulx T1, T0, [pA + 6 * 8] + adcx R6, T0 + adox R7, T1 + + mulx T1, T0, [pA + 7 * 8] + adcx R7, T0 + mov [pD + 1 * 8], R1 + + adcx R0, R0 + adox R0, T1 + mov [pD + 2 * 8], R2 + mov QH, [pA + 1 * 8] // QH = A1 + + //======= + + mulx T1, T0, [pA + 2 * 8] + adcx R3, T0 + adox R4, T1 + + mulx T1, T0, [pA + 3 * 8] + adcx R4, T0 + adox R5, T1 + + mulx T1, T0, [pA + 4 * 8] + adcx R5, T0 + adox R6, T1 + + mulx T1, T0, [pA + 5 * 8] + adcx R6, T0 + adox R7, T1 + + mulx T1, T0, [pA + 6 * 8] + adcx R7, T0 + adox R0, T1 + + mov QH, [pA + 7 * 8] // QH = A7 + mov R1, 0 + mov R2, 0 + mov [pD + 3 * 8], R3 + + mulx T1, T0, [pA + 1 * 8] + adcx R0, T0 + adox R1, T1 // doesn't produce Ov as T1 <= 0xff..fe and R1=0 + + mulx T1, T0, [pA + 2 * 8] + adcx R1, T0 + mov [pD + 4 * 8], R4 + + adcx R2, T1 + mov QH, [pA + 2 * 8] // QH = A2 + + //====== + + mulx T1, T0, [pA + 3 * 8] + adcx R5, T0 + adox R6, T1 + + mulx T1, T0, [pA + 4 * 8] + adcx 
R6, T0 + adox R7, T1 + + mulx T1, T0, [pA + 5 * 8] + adcx R7, T0 + adox R0, T1 + + mulx T1, T0, [pA + 6 * 8] + adcx R0, T0 + adox R1, T1 + + mov QH, [pA + 4 * 8] // QH = A4 + mov R3, 0 + mov R4, 0 + + mulx T1, T0, [pA + 5 * 8] + adcx R1, T0 + adox R2, T1 + + mulx T1,T0, [pA + 6 * 8] + adcx R2, T0 + adox R3, T1 // doesn't produce Ov as T1 <= 0xff..fe and R3=0 + + mov QH, [pA + 5 * 8] // QH = A5 + mov [pD + 5 * 8], R5 + + mulx T1, T0, [pA + 6 * 8] + adcx R3, T0 + adcx R4, T1 + + mov QH, [pA + 3 * 8] // QH = A3 + mov [pD + 6 * 8], R6 + + //====== + + mulx T1, T0, [pA + 4 * 8] + adcx R7, T0 + adox R0, T1 + + mulx T1, T0, [pA + 5 * 8] + adcx R0, T0 + adox R1, T1 + + mulx T1, T0, [pA + 6 * 8] + adcx R1, T0 + adox R2, T1 + + mulx T1, T0, [pA + 7 * 8] + adcx R2, T0 + adox R3, T1 + + mov QH, [pA + 7 * 8] // QH = A7 + mov R5, 0 + mov R6, 0 + mov [pD + 7 * 8], R7 + + mulx T1, T0, [pA + 4 * 8] + adcx R3, T0 + adox R4, T1 + + mulx T1, T0, [pA + 5 * 8] + adcx R4, T0 + adox R5, T1 // doesn't produce Ov as T1 <= 0xff..fe and R5=0 + + mulx T1, T0, [pA + 6 * 8] + adcx R5, T0 + adcx R6, T1 + + xor R7, R7 +MACRO_END() + +MACRO_START(MONTGOMERY18, R0, R1, R2, R3, R4, R5, R6, R7, modInv, pMod, pMont, T0, T1, QH) + // Mont[0] = (modinv * R0 mod 2^64) + // R0:R[7:1]: = Mont[0] * Mod[7:0] + R[7:0] + // Pre: - + // Post: - + + mov QH, R0 + imul QH, modInv + + // Rather than add the low half of the first mulx to R0 we can go ahead and set + // up the Cy flag appropriately based on R0 directly (the addition will always + // result in 0 by construction), so we can have the result while imul is running + + // This has a small but measurable perf improvement on SKLX (~2% improvement for + // 512b modmul) + // and it seems unlikely that it can make the performance worse + // My best guess as to why is that allowing this to execute a few cycles early + // can reduce port contention when the macro is being speculatively executed + or T0, -1 // Clear Cy and Ov + adcx R0, T0 // Set Cy when R0 is non-zero + mov R0, 0 + mov [pMont], QH + + mulx T1, T1, [pMod + 0 * 8] + adox R1, T1 + + mulx T1, T0, [pMod + 1 * 8] + adcx R1, T0 + adox R2, T1 + + mulx T1, T0, [pMod + 2 * 8] + adcx R2, T0 + adox R3, T1 + + mulx T1, T0, [pMod + 3 * 8] + adcx R3, T0 + adox R4, T1 + + mulx T1, T0, [pMod + 4 * 8] + adcx R4, T0 + adox R5, T1 + + mulx T1, T0, [pMod + 5 * 8] + adcx R5, T0 + adox R6, T1 + + mulx T1, T0, [pMod + 6 * 8] + adcx R6, T0 + adox R7, T1 + + mulx T1, T0, [pMod + 7 * 8] + adcx R7, T0 + + adcx R0, R0 + adox R0, T1 +MACRO_END() + +MACRO_START(SYMCRYPT_SQUARE_DIAG, index, src_reg, dest_reg, T0, T1, T2, T3, QH) + mov QH, [src_reg + 8 * index] + mov T0, [dest_reg + 16 * index] + mov T1, [dest_reg + 16 * index + 8] + mulx T3, T2, QH + adcx T2, T0 + adox T2, T0 + adcx T3, T1 + adox T3, T1 + mov [dest_reg + 16 * index], T2 + mov [dest_reg + 16 * index + 8], T3 +MACRO_END() + +// VOID +// SYMCRYPT_CALL +// SymCryptFdefRawMulMulx( +// _In_reads_(nDigits1*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc1, +// UINT32 nDigits1, +// _In_reads_(nDigits2*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc2, +// UINT32 nDigits2, +// _Out_writes_((nDigits1+nDigits2)*SYMCRYPT_FDEF_DIGIT_NUINT32) PUINT32 pDst ) + +MUL_FUNCTION_START(SymCryptFdefRawMulMulx, 5, 14) + + shl Q4, 6 + mov [rsp + GET_MEMSLOT_OFFSET(slot0)], Q4 + mov [rsp + GET_MEMSLOT_OFFSET(slot1)], D2 + + // First we wipe nDigits2 of the result (size of in) + mov Q6, Q5 + + // Wipe destination for nDigit2 blocks + xorps xmm0,xmm0 // Zero register for 16-byte wipes + mov Q0, Q4 + 
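Before the wipe loop below, it is worth pinning down what the multiply-add macros defined above compute. Here is a minimal C model of MULADD18 (illustrative only, not the SymCrypt C implementation; the helper name and the `unsigned __int128` type are assumptions). The asm gets its speed from running two independent carry chains, adcx on the carry flag and adox on the overflow flag, so the eight 64x64-to-128 partial products never serialize on a single flags register.

```c
#include <stdint.h>

typedef unsigned __int128 uint128_t;

// R0:R[7:1]:D[0] = A[7:0] * b + D[0] + R[7:0], as in the macro contract.
// The low word of the sum goes back to *pD; words 1..8 end up in R[0..7],
// modeling the rotated register naming the asm uses instead of moves.
static void muladd18( uint64_t R[8], uint64_t *pD, const uint64_t A[8], uint64_t b )
{
    uint64_t s[9];
    uint128_t c = 0;

    for( int i = 0; i < 8; i++ )
    {
        c += (uint128_t) A[i] * b + R[i];
        if( i == 0 )
        {
            c += *pD;           // D[0] joins the accumulation
        }
        s[i] = (uint64_t) c;
        c >>= 64;
    }
    s[8] = (uint64_t) c;

    *pD = s[0];
    for( int i = 0; i < 8; i++ )
    {
        R[i] = s[i + 1];        // register rotation R1,...,R7,R0
    }
}
```

MULADD88 is simply eight of these steps with the register roles rotated each time, and the function bodies below iterate it one 64-byte digit at a time.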
+SymCryptFdefRawMulMulxWipeLoop: + movaps [Q6],xmm0 + movaps [Q6+16],xmm0 // Wipe 32 bytes + movaps [Q6+32],xmm0 // Wipe 32 bytes + movaps [Q6+48],xmm0 // Wipe 32 bytes + add Q6, 64 + sub Q0, 64 + jnz SymCryptFdefRawMulMulxWipeLoop + + +SymCryptFdefRawMulxOuterLoop: + + ZEROREG_8 Q6, Q7, Q8, Q9, Q10, Q11, Q12, Q13 // Leaves Cy = Ov = 0 + +SymCryptFdefRawMulMulxInnerLoop: + + // Register allocation in loops: + // Q6, Q7, Q8, Q9, Q10, Q11, Q12, Q13 8-word carry + // Q0, Q2 temps for multiplication + // Q1, Q3 pSrc1, pSrc2 running pointers + // Q4 inner loop counter + // QH fixed input reg for multiplication + // Q5 Destination running pointer inner loop + // slot0 nDigits2*64 + // slot1 outer loop counter + + MULADD88 Q6, Q7, Q8, Q9, Q10, Q11, Q12, Q13, Q5, Q1, Q3, Q0, Q2, QH + + add Q3, 64 // Src2 ptr + add Q5, 64 + + sub D4, 64 // sets Cy = Ov = 0 because 64*nDigits2 < 2^32 + jnz SymCryptFdefRawMulMulxInnerLoop + + // Write the 8-word carry-out to the destination + mov [Q5 + 0*8], Q6 + mov [Q5 + 1*8], Q7 + mov [Q5 + 2*8], Q8 + mov [Q5 + 3*8], Q9 + mov [Q5 + 4*8], Q10 + mov [Q5 + 5*8], Q11 + mov [Q5 + 6*8], Q12 + mov [Q5 + 7*8], Q13 + + // set up for next iteration + // reload 64*nDigits2 + mov Q4, [rsp + GET_MEMSLOT_OFFSET(slot0)] + + // reset Q5 & increment + sub Q5, Q4 + add Q5, 64 + // reset Q3 + sub Q3, Q4 + + // update PSrc1 + add Q1, 64 + + // nDigits1 loop counter + mov D2, [rsp + GET_MEMSLOT_OFFSET(slot1)] + sub D2, 1 // sets Cy = Ov = 0 because nDigits1 < 2^32 / 64 + mov [rsp + GET_MEMSLOT_OFFSET(slot1)], D2 + jnz SymCryptFdefRawMulxOuterLoop + +MUL_FUNCTION_END(SymCryptFdefRawMulMulx) + +// VOID +// SYMCRYPT_CALL +// SymCryptFdefRawSquareMulx( +// _In_reads_(nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc, +// UINT32 nDigits, +// _Out_writes_(2*nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PUINT32 pDst ) + +MUL_FUNCTION_START(SymCryptFdefRawSquareMulx, 3, 14) + + // Q1 = pSrc + // Q2 = nDigits + // Q3 = pDst + + // Save parameters for phase 2 + + mov [rsp + GET_MEMSLOT_OFFSET(slot0)], Q1 // save pSrc + mov [rsp + GET_MEMSLOT_OFFSET(slot1)], Q2 // save nDigits + mov [rsp + GET_MEMSLOT_OFFSET(slot2)], Q3 // save pDst + + shl Q2, 6 // nDigits * 64 = # bytes in Src to process + mov [rsp + GET_MEMSLOT_OFFSET(slot3)], Q2 // save # bytes in Src to process + + // Wipe destination for nDigits blocks + xor Q0, Q0 + mov Q5, Q3 + mov Q4, Q2 + +SymCryptFdefRawSquareMulxWipeLoop: + // we use 8-byte writes as we will be reading this very soon in 8-byte chunks, and this way the store-load + // forwarding works + mov [Q5 ], Q0 + mov [Q5 + 8], Q0 + mov [Q5 + 16], Q0 + mov [Q5 + 24], Q0 + mov [Q5 + 32], Q0 + mov [Q5 + 40], Q0 + mov [Q5 + 48], Q0 + mov [Q5 + 56], Q0 + add Q5, 64 + sub Q4, 64 + jnz SymCryptFdefRawSquareMulxWipeLoop + + // Cy = Ov = 0 here because the last 'sub Q4,64' yielded 0 + +SymCryptFdefRawSquareMulxOuterLoop: + + HALF_SQUARE_NODIAG8 Q6, Q7, Q8, Q9, Q10, Q11, Q12, Q13, Q3, Q1, Q0, Q4, QH + + sub Q2, 64 + jz SymCryptFdefRawSquareMulxPhase2 // end of phase 1 + + lea Q5, [Q1 + 64] + lea Q3, [Q3 + 64] + +SymCryptFdefRawSquareMulxInnerLoop: + // Q6, Q7, Q8, Q9, Q10, Q11, Q12, Q13 8-word carry + // Q0, Q4 temps for multiplication + // Q1 pSrc running pointer outer loop + // Q2 # bytes left in pSrc to process in the inner loop + // Q3 pDst running pointer inner loop + // QH fixed input reg for multiplication + // Q5 pSrc running pointer inner loop + + // slot0 pSrc (used for final pass) + // slot1 nDigits (used for final pass) + // slot2 pDst (used for final pass) + // slot3 # bytes to 
process from pSrc in this iteration + + MULADD88 Q6, Q7, Q8, Q9, Q10, Q11, Q12, Q13, Q3, Q1, Q5, Q0, Q4, QH + + add Q3, 64 + add Q5, 64 + + sub Q2, 64 // Sets Cy = Ov = 0 because nDigits < 2^32 / bits_per_digit + jnz SymCryptFdefRawSquareMulxInnerLoop + + // Write the 8-word carry-out to the destination + mov [Q3 + 0*8], Q6 + mov [Q3 + 1*8], Q7 + mov [Q3 + 2*8], Q8 + mov [Q3 + 3*8], Q9 + mov [Q3 + 4*8], Q10 + mov [Q3 + 5*8], Q11 + mov [Q3 + 6*8], Q12 + mov [Q3 + 7*8], Q13 + + mov Q2, [rsp + GET_MEMSLOT_OFFSET(slot3)] // restore # bytes in Src to process next + + add Q1, 64 // Shift outer Src pointer by 1 digit + sub Q3, Q2 // reset output ptr + add Q3, 128 // Shift output ptr by 2 digits + + sub Q2, 64 // Reduce number of bytes to process by 1 digit + mov [rsp + GET_MEMSLOT_OFFSET(slot3)], Q2 + + jmp SymCryptFdefRawSquareMulxOuterLoop + + +SymCryptFdefRawSquareMulxPhase2: + // Cy = Ov = 0 because last 'sub Q2, 64' resulted in 0 + + // Write the 8-word carry-out to the destination + mov [Q3 + 8*8], Q6 + mov [Q3 + 9*8], Q7 + mov [Q3 + 10*8], Q8 + mov [Q3 + 11*8], Q9 + mov [Q3 + 12*8], Q10 + mov [Q3 + 13*8], Q11 + mov [Q3 + 14*8], Q12 + mov [Q3 + 15*8], Q13 + + // Compute diagonals, and add double the result so far + + mov Q1, [rsp + GET_MEMSLOT_OFFSET(slot0)] + mov Q2, [rsp + GET_MEMSLOT_OFFSET(slot1)] + mov Q3, [rsp + GET_MEMSLOT_OFFSET(slot2)] + + // We can't keep the carries in Cy and Ov because there is no way to do a loop counter + // without touching the Ov flag. + // So we set the Ov carry in Q0, and retain a zero in Q4 + xor Q0, Q0 + xor Q4, Q4 + +SymCryptFdefRawSquareMulxDiagonalsLoop: + // Cy = carry in + // Q0 = carry in (1 bit) + // Ov = 0 + + // First word is different to handle the carry + // SYMCRYPT_SQUARE_DIAG 0, Q1, Q3, Q5, Q6, Q7, Q8, QH + mov QH, [Q1] + mov Q5, [Q3] + mov Q6, [Q3 + 8] + mulx Q8, Q7, QH + adcx Q7, Q0 // add both carries + adcx Q8, Q4 // Q4 = 0 - now Cy = 0 because result of multiply <= ff..fe00..01 + + adcx Q7, Q5 + adox Q7, Q5 + adcx Q8, Q6 + adox Q8, Q6 + mov [Q3], Q7 + mov [Q3 + 8], Q8 + + SYMCRYPT_SQUARE_DIAG 1, Q1, Q3, Q5, Q6, Q7, Q8, QH + SYMCRYPT_SQUARE_DIAG 2, Q1, Q3, Q5, Q6, Q7, Q8, QH + SYMCRYPT_SQUARE_DIAG 3, Q1, Q3, Q5, Q6, Q7, Q8, QH + SYMCRYPT_SQUARE_DIAG 4, Q1, Q3, Q5, Q6, Q7, Q8, QH + SYMCRYPT_SQUARE_DIAG 5, Q1, Q3, Q5, Q6, Q7, Q8, QH + SYMCRYPT_SQUARE_DIAG 6, Q1, Q3, Q5, Q6, Q7, Q8, QH + SYMCRYPT_SQUARE_DIAG 7, Q1, Q3, Q5, Q6, Q7, Q8, QH + + // Move the Ov flag into Q0 + mov D0, D4 + adox D0, D4 + + // There is no way to do a loop counter without overwriting the Ov flag + // Even the 'dec' instruction touches it, and LAHF/SAHF doesn't load/store the Ov flag. 
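For reference, the arithmetic of this diagonal pass can be modeled in C with one explicit carry word (an illustrative sketch with assumed names, not SymCrypt source). Adding each destination word through both the adcx and adox chains is what achieves the doubling in the asm; the model simply multiplies by two.

```c
#include <stdint.h>

typedef unsigned __int128 uint128_t;

// On entry dst holds the cross products H = sum_{i<j} a[i]*a[j]*2^(64(i+j));
// this pass computes dst = 2*H + sum_i a[i]^2 * 2^(128*i).
static void square_diagonals( uint64_t *dst, const uint64_t *a, int nWords )
{
    uint128_t c = 0;    // the asm splits this carry over the Cy and Ov flags

    for( int i = 0; i < nWords; i++ )
    {
        uint128_t sq = (uint128_t) a[i] * a[i];

        c += ((uint128_t) dst[2*i] << 1) + (uint64_t) sq;
        dst[2*i] = (uint64_t) c;
        c >>= 64;

        c += ((uint128_t) dst[2*i + 1] << 1) + (uint64_t) (sq >> 64);
        dst[2*i + 1] = (uint64_t) c;
        c >>= 64;
    }
}
```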
+ // We can't push/pop efl in a function body + + lea Q1, [Q1 + 64] + lea Q3, [Q3 + 128] + dec Q2 + jnz SymCryptFdefRawSquareMulxDiagonalsLoop + +MUL_FUNCTION_END(SymCryptFdefRawSquareMulx) + +// VOID +// SYMCRYPT_CALL +// SymCryptFdefMontgomeryReduceMulx( +// _In_ PCSYMCRYPT_MODULUS pmMod, +// _In_ PUINT32 pSrc, +// _Out_ PUINT32 pDst ) +MUL_FUNCTION_START(SymCryptFdefMontgomeryReduceMulx, 3, 14) + + mov [rsp + GET_MEMSLOT_OFFSET(slot0)], Q1 + mov [rsp + GET_MEMSLOT_OFFSET(slot1)], Q2 + mov [rsp + GET_MEMSLOT_OFFSET(slot2)], Q3 + + mov D0, [Q1 + SymCryptModulusNdigitsOffsetAmd64] + mov [rsp + GET_MEMSLOT_OFFSET(slot3)], D0 + // CntOuter = nDigits - using first half of slot3 + + xor D4, D4 + mov [rsp + GET_MEMSLOT_OFFSET(slot3) + 4], D4 + // HighCarry = 0 - using second half of slot3 + +SymCryptFdefMontgomeryReduceMulxOuterLoop: + // Q1 = pmMod + // Q2 = pSrc = tmp buffer that we will reduce + mov Q6, [Q2 + 0 * 8] + mov Q7, [Q2 + 1 * 8] + mov Q8, [Q2 + 2 * 8] + mov Q9, [Q2 + 3 * 8] + mov Q10, [Q2 + 4 * 8] + mov Q11, [Q2 + 5 * 8] + mov Q12, [Q2 + 6 * 8] + mov Q13, [Q2 + 7 * 8] + + mov Q3, [Q1 + SymCryptModulusMontgomeryInv64OffsetAmd64] // inv64 + mov D4, [Q1 + SymCryptModulusNdigitsOffsetAmd64] + lea Q1, [Q1 + SymCryptModulusValueOffsetAmd64] // modulus value + + // Q2 = value to reduce + // Q6 - Q13 = Q2[0..7] + // Q1 = modulus value + // Q3 = modinv + + MONTGOMERY18 Q6, Q7, Q8, Q9, Q10, Q11, Q12, Q13, Q3, Q1, Q2 + (0 * 8), Q0, Q5, QH + MONTGOMERY18 Q7, Q8, Q9, Q10, Q11, Q12, Q13, Q6, Q3, Q1, Q2 + (1 * 8), Q0, Q5, QH + MONTGOMERY18 Q8, Q9, Q10, Q11, Q12, Q13, Q6, Q7, Q3, Q1, Q2 + (2 * 8), Q0, Q5, QH + MONTGOMERY18 Q9, Q10, Q11, Q12, Q13, Q6, Q7, Q8, Q3, Q1, Q2 + (3 * 8), Q0, Q5, QH + MONTGOMERY18 Q10, Q11, Q12, Q13, Q6, Q7, Q8, Q9, Q3, Q1, Q2 + (4 * 8), Q0, Q5, QH + MONTGOMERY18 Q11, Q12, Q13, Q6, Q7, Q8, Q9, Q10, Q3, Q1, Q2 + (5 * 8), Q0, Q5, QH + MONTGOMERY18 Q12, Q13, Q6, Q7, Q8, Q9, Q10, Q11, Q3, Q1, Q2 + (6 * 8), Q0, Q5, QH + MONTGOMERY18 Q13, Q6, Q7, Q8, Q9, Q10, Q11, Q12, Q3, Q1, Q2 + (7 * 8), Q0, Q5, QH + + // Q6 - Q13 = carry from multiply-add + // Q2[0..7] = Montgomery factors + + mov Q3, Q2 // factor to multiply by + add Q1, 64 + add Q2, 64 + + dec D4 + jz SymCryptFdefMontgomeryReduceMulxInnerLoopDone + +SymCryptFdefMontgomeryReduceMulxInnerLoop: + + // Q6, Q7, Q8, Q9, Q10, Q11, Q12, Q13 8-word carry + // Q0, Q5 temps for multiplication + // Q1 running pointer pMod inner loop + // Q2 running pointer pSrc inner loop + // Q3 Montgomery factors for this row + // D4 loop ctr + // QH fixed input reg for multiplication + + MULADD88 Q6, Q7, Q8, Q9, Q10, Q11, Q12, Q13, Q2, Q1, Q3, Q0, Q5, QH + // pre & post: Cy = Ov = 0 + // Q13..Q6:Q2[7-0] = R[7-0]:D[7-0] = A[7:0] * B[7:0] + R[7:0] + D[7:0] + // QH is volatile + + add Q1, 64 + add Q2, 64 + dec D4 + jnz SymCryptFdefMontgomeryReduceMulxInnerLoop + + +SymCryptFdefMontgomeryReduceMulxInnerLoopDone: + + // We have an 8-word carry here, which we need to add to the in-memory buffer and retain a carry + // We also saved a 1-bit carry from the previous outer loop + mov D5, [rsp + GET_MEMSLOT_OFFSET(slot3) + 4] + // move carry into Cy flag + neg D5 + + // We do this in separate instructions to help the instruction decoder build up a lead... 
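Each MONTGOMERY18 invocation above performs one word of the reduction. A C model of that step (illustrative, with hypothetical names; inv64 is the precomputed negated inverse of the modulus mod 2^64 held in the modulus object):

```c
#include <stdint.h>

typedef unsigned __int128 uint128_t;

// Choose m = inv64 * r[0] mod 2^64 so that r + m*mod has a zero low word,
// add m*mod, and drop that word (a division by 2^64). Returns the factor m,
// which the asm stores over pSrc for the row multiply.
static uint64_t montgomery_word( uint64_t r[8], const uint64_t mod[8], uint64_t inv64 )
{
    uint64_t m = r[0] * inv64;                              // Montgomery factor
    uint128_t c = ((uint128_t) m * mod[0] + r[0]) >> 64;    // low word is 0 by choice of m

    for( int i = 1; i < 8; i++ )
    {
        c += (uint128_t) m * mod[i] + r[i];
        r[i - 1] = (uint64_t) c;
        c >>= 64;
    }
    r[7] = (uint64_t) c;
    return m;
}
```

Eight of these steps zero out one full digit; MULADD88 then multiplies the saved factors into the remaining digits, and the adc chain that follows folds the resulting 8-word carry back into the buffer.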
+ mov Q0, [Q2 + 0 * 8] + adc Q0, Q6 + mov [Q2 + 0 * 8], Q0 + + mov Q5, [Q2 + 1 * 8] + adc Q5, Q7 + mov [Q2 + 1 * 8], Q5 + + mov Q0, [Q2 + 2 * 8] + adc Q0, Q8 + mov [Q2 + 2 * 8], Q0 + + mov Q5, [Q2 + 3 * 8] + adc Q5, Q9 + mov [Q2 + 3 * 8], Q5 + + mov Q0, [Q2 + 4 * 8] + adc Q0, Q10 + mov [Q2 + 4 * 8], Q0 + + mov Q5, [Q2 + 5 * 8] + adc Q5, Q11 + mov [Q2 + 5 * 8], Q5 + + mov Q0, [Q2 + 6 * 8] + adc Q0, Q12 + mov [Q2 + 6 * 8], Q0 + + mov Q5, [Q2 + 7 * 8] + adc Q5, Q13 + mov [Q2 + 7 * 8], Q5 + + adc D4, D4 // D4 = carry (D4 was previously zero) + mov [rsp + GET_MEMSLOT_OFFSET(slot3) + 4], D4 + + mov Q2, [rsp + GET_MEMSLOT_OFFSET(slot1)] + add Q2, 64 + mov [rsp + GET_MEMSLOT_OFFSET(slot1)], Q2 + + mov Q1, [rsp + GET_MEMSLOT_OFFSET(slot0)] + + mov D0, [rsp + GET_MEMSLOT_OFFSET(slot3)] + dec D0 + mov [rsp + GET_MEMSLOT_OFFSET(slot3)], D0 + + jnz SymCryptFdefMontgomeryReduceMulxOuterLoop + + // D4 = output carry + + mov D6, [Q1 + SymCryptModulusNdigitsOffsetAmd64] + lea Q1, [Q1 + SymCryptModulusValueOffsetAmd64] // modulus value + + mov Q3, [rsp + GET_MEMSLOT_OFFSET(slot2)] + + // Q2 = result buffer pointer + // D6 = # digits + // Q1 = modulus value + // Q3 = Dst + + // copy these values for the masked copy loop + mov D7, D6 // nDigits + mov Q8, Q2 // result buffer + mov Q9, Q3 // destination pointer + + // pDst = Reduction result - Modulus + +SymCryptFdefMontgomeryReduceMulxSubLoop: + mov Q0,[Q2 + 0 * 8] + sbb Q0,[Q1 + 0 * 8] + mov [Q3 + 0 * 8], Q0 + + mov Q5,[Q2 + 1 * 8] + sbb Q5,[Q1 + 1 * 8] + mov [Q3 + 1 * 8], Q5 + + mov Q0,[Q2 + 2 * 8] + sbb Q0,[Q1 + 2 * 8] + mov [Q3 + 2 * 8], Q0 + + mov Q5,[Q2 + 3 * 8] + sbb Q5,[Q1 + 3 * 8] + mov [Q3 + 3 * 8], Q5 + + mov Q0,[Q2 + 4 * 8] + sbb Q0,[Q1 + 4 * 8] + mov [Q3 + 4 * 8], Q0 + + mov Q5,[Q2 + 5 * 8] + sbb Q5,[Q1 + 5 * 8] + mov [Q3 + 5 * 8], Q5 + + mov Q0,[Q2 + 6 * 8] + sbb Q0,[Q1 + 6 * 8] + mov [Q3 + 6 * 8], Q0 + + mov Q5,[Q2 + 7 * 8] + sbb Q5,[Q1 + 7 * 8] + mov [Q3 + 7 * 8], Q5 + + lea Q2, [Q2 + 64] + lea Q1, [Q1 + 64] + lea Q3, [Q3 + 64] + dec D6 + jnz SymCryptFdefMontgomeryReduceMulxSubLoop + + // now a masked copy from the reduction buffer to the destination. 
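In C, the constant-time selection about to happen looks roughly like this (a minimal sketch with assumed names, not SymCrypt source): the conditional undo of the subtraction uses an all-ones/all-zeroes mask instead of a branch, so the timing and memory access pattern do not depend on the secret value.

```c
#include <stdint.h>

// borrow is the Cy flag out of the subtraction loop, highCarry the saved
// carry bit; the unsubtracted value is copied back exactly when the
// subtraction underflowed, i.e. highCarry == 0 and borrow == 1.
static void masked_copy( uint64_t *dst, const uint64_t *src,
                         uint32_t highCarry, uint32_t borrow, int nWords )
{
    uint64_t mask = (uint64_t) 0 - (uint64_t) (borrow & (1 ^ highCarry));

    for( int i = 0; i < nWords; i++ )
    {
        dst[i] = (src[i] & mask) | (dst[i] & ~mask);
    }
}
```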
+ // copy if high carry = 0 and Cy = 1
+ sbb D4, 0
+ // D4 = copy mask, ff...ff if copy, 0 if no copy
+
+ movd xmm0, D4 // xmm0[0] = mask
+ pcmpeqd xmm1, xmm1 // xmm1 = ff...ff
+ pshufd xmm0, xmm0, 0 // xmm0[0..3] = mask
+ pxor xmm1, xmm0 // xmm1 = not Mask
+
+SymCryptFdefMontgomeryReduceMulxMaskedCopyLoop:
+ movdqa xmm2, [Q8 + 0 * 16] // xmm2 = pSrc[0]
+ movdqa xmm3, [Q9 + 0 * 16] // xmm3 = pDst[0]
+ pand xmm2, xmm0
+ pand xmm3, xmm1
+ por xmm2, xmm3
+ movdqa [Q9 + 0 * 16], xmm2
+
+ movdqa xmm2, [Q8 + 1 * 16] // xmm2 = pSrc[1]
+ movdqa xmm3, [Q9 + 1 * 16] // xmm3 = pDst[1]
+ pand xmm2, xmm0
+ pand xmm3, xmm1
+ por xmm2, xmm3
+ movdqa [Q9 + 1 * 16], xmm2
+
+ movdqa xmm2, [Q8 + 2 * 16] // xmm2 = pSrc[2]
+ movdqa xmm3, [Q9 + 2 * 16] // xmm3 = pDst[2]
+ pand xmm2, xmm0
+ pand xmm3, xmm1
+ por xmm2, xmm3
+ movdqa [Q9 + 2 * 16], xmm2
+
+ movdqa xmm2, [Q8 + 3 * 16] // xmm2 = pSrc[3]
+ movdqa xmm3, [Q9 + 3 * 16] // xmm3 = pDst[3]
+ pand xmm2, xmm0
+ pand xmm3, xmm1
+ por xmm2, xmm3
+ movdqa [Q9 + 3 * 16], xmm2
+
+ // Move on to the next digit
+
+ add Q8, 64
+ add Q9, 64
+ dec D7
+ jnz SymCryptFdefMontgomeryReduceMulxMaskedCopyLoop
+
+MUL_FUNCTION_END(SymCryptFdefMontgomeryReduceMulx)
+
+// --------------------------------
+// 1024-bit size specific functions
+// --------------------------------
+
+//VOID
+//SYMCRYPT_CALL
+//SymCryptFdefRawMulMulx1024(
+// _In_reads_(nWords1) PCUINT32 pSrc1,
+// _In_reads_(nWords2) PCUINT32 pSrc2,
+// UINT32 nDigits,
+// _Out_writes_(nWords1 + nWords2) PUINT32 pDst )
+
+MUL_FUNCTION_START(SymCryptFdefRawMulMulx1024, 4, 13)
+
+ // First we wipe nDigits of the result (the size of one input)
+ // Q1 = pSrc1
+ // Q2 = pSrc2
+ // Q3 = nDigits
+ // Q4 = pDst
+
+ // Wipe destination for 2 digit blocks
+ xorps xmm0,xmm0 // Zero register for 16-byte wipes
+
+ movaps [Q4],xmm0
+ movaps [Q4+16],xmm0 // Wipe 32 bytes
+ movaps [Q4+32],xmm0 // Wipe 32 bytes
+ movaps [Q4+48],xmm0 // Wipe 32 bytes
+
+ movaps [Q4+64],xmm0
+ movaps [Q4+80],xmm0 // Wipe 32 bytes
+ movaps [Q4+96],xmm0 // Wipe 32 bytes
+ movaps [Q4+112],xmm0 // Wipe 32 bytes
+
+ // Digit 1 from src2
+
+ ZEROREG_8 Q5, Q6, Q7, Q8, Q9, Q10, Q11, Q12 // Leaves Cy = Ov = 0
+
+ MULADD88 Q5, Q6, Q7, Q8, Q9, Q10, Q11, Q12, Q4, Q1, Q2, Q0, Q3, QH
+
+ add Q2, 64 // Src2 ptr
+ add Q4, 64
+ xor Q0, Q0 // sets Cy = Ov = 0
+
+ MULADD88 Q5, Q6, Q7, Q8, Q9, Q10, Q11, Q12, Q4, Q1, Q2, Q0, Q3, QH
+
+ add Q4, 64
+
+ // Write the 8-word carry-out to the destination
+ mov [Q4 + 0*8], Q5
+ mov [Q4 + 1*8], Q6
+ mov [Q4 + 2*8], Q7
+ mov [Q4 + 3*8], Q8
+ mov [Q4 + 4*8], Q9
+ mov [Q4 + 5*8], Q10
+ mov [Q4 + 6*8], Q11
+ mov [Q4 + 7*8], Q12
+
+ // Digit 2 from src2
+
+ // set up
+
+ // Move Q4 one digit back
+ sub Q4, 64
+
+ // reload pSrc2
+ sub Q2, 64
+
+ // update pSrc1
+ add Q1, 64
+
+ ZEROREG_8 Q5, Q6, Q7, Q8, Q9, Q10, Q11, Q12 // Leaves Cy = Ov = 0
+
+ MULADD88 Q5, Q6, Q7, Q8, Q9, Q10, Q11, Q12, Q4, Q1, Q2, Q0, Q3, QH
+
+ add Q2, 64 // Src2 ptr
+ add Q4, 64
+ xor Q0, Q0 // sets Cy = Ov = 0
+
+ MULADD88 Q5, Q6, Q7, Q8, Q9, Q10, Q11, Q12, Q4, Q1, Q2, Q0, Q3, QH
+
+ add Q4, 64
+
+ // Write the 8-word carry-out to the destination
+ mov [Q4 + 0*8], Q5
+ mov [Q4 + 1*8], Q6
+ mov [Q4 + 2*8], Q7
+ mov [Q4 + 3*8], Q8
+ mov [Q4 + 4*8], Q9
+ mov [Q4 + 5*8], Q10
+ mov [Q4 + 6*8], Q11
+ mov [Q4 + 7*8], Q12
+
+MUL_FUNCTION_END(SymCryptFdefRawMulMulx1024)
+
+// VOID
+// SYMCRYPT_CALL
+// SymCryptFdefRawSquareMulx1024(
+// _In_reads_(nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc,
+// UINT32 nDigits,
+// _Out_writes_(2*nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PUINT32 pDst )
+ +MUL_FUNCTION_START(SymCryptFdefRawSquareMulx1024, 3, 13) + + // Wipe 128 bytes of destination + xorps xmm0,xmm0 // Zero register for 16-byte wipes + + movaps [Q3],xmm0 + movaps [Q3+16],xmm0 + movaps [Q3+32],xmm0 + movaps [Q3+48],xmm0 + + movaps [Q3+64],xmm0 + movaps [Q3+80],xmm0 + movaps [Q3+96],xmm0 + movaps [Q3+112],xmm0 + + xor Q0, Q0 // Sets Cy = Ov = 0 + + HALF_SQUARE_NODIAG8 Q5, Q6, Q7, Q8, Q9, Q10, Q11, Q12, Q3, Q1, Q0, Q2, QH + + lea Q4, [Q1 + 64] // Q4 = pSrc + 64 + lea Q3, [Q3 + 64] // Q3 = pDst + 64 + + // Q5, Q6, Q7, Q8, Q9, Q10, Q11, Q12 8-word carry + // Q0, Q2 temps for multiplication + // Q1 pSrc (constant) + // Q4 pSrc + 64 (constant) + // Q3 pDst running pointer + // QH fixed input reg for multiplication + + MULADD88 Q5, Q6, Q7, Q8, Q9, Q10, Q11, Q12, Q3, Q1, Q4, Q0, Q2, QH + + add Q3, 64 // Q3 = pDst + 128 + + // Write the 8-word carry-out to the destination + mov [Q3 + 0*8], Q5 + mov [Q3 + 1*8], Q6 + mov [Q3 + 2*8], Q7 + mov [Q3 + 3*8], Q8 + mov [Q3 + 4*8], Q9 + mov [Q3 + 5*8], Q10 + mov [Q3 + 6*8], Q11 + mov [Q3 + 7*8], Q12 + + // Q3 which is the destination pointer is shifted here by 2 digits + + xor Q0, Q0 // Sets Cy = Ov = 0 + + HALF_SQUARE_NODIAG8 Q5, Q6, Q7, Q8, Q9, Q10, Q11, Q12, Q3, Q4, Q0, Q2, QH + + // Write the 8-word carry-out to the destination + mov [Q3 + 8*8], Q5 + mov [Q3 + 9*8], Q6 + mov [Q3 + 10*8], Q7 + mov [Q3 + 11*8], Q8 + mov [Q3 + 12*8], Q9 + mov [Q3 + 13*8], Q10 + mov [Q3 + 14*8], Q11 + mov [Q3 + 15*8], Q12 + + // Compute diagonals, and add double the result so far + + sub Q3, 128 // Q3 = pDst - sets Cy = Ov = 0 + + SYMCRYPT_SQUARE_DIAG 0, Q1, Q3, Q0, Q2, Q4, Q5, QH + SYMCRYPT_SQUARE_DIAG 1, Q1, Q3, Q0, Q2, Q4, Q5, QH + SYMCRYPT_SQUARE_DIAG 2, Q1, Q3, Q0, Q2, Q4, Q5, QH + SYMCRYPT_SQUARE_DIAG 3, Q1, Q3, Q0, Q2, Q4, Q5, QH + SYMCRYPT_SQUARE_DIAG 4, Q1, Q3, Q0, Q2, Q4, Q5, QH + SYMCRYPT_SQUARE_DIAG 5, Q1, Q3, Q0, Q2, Q4, Q5, QH + SYMCRYPT_SQUARE_DIAG 6, Q1, Q3, Q0, Q2, Q4, Q5, QH + SYMCRYPT_SQUARE_DIAG 7, Q1, Q3, Q0, Q2, Q4, Q5, QH + + SYMCRYPT_SQUARE_DIAG 8, Q1, Q3, Q0, Q2, Q4, Q5, QH + SYMCRYPT_SQUARE_DIAG 9, Q1, Q3, Q0, Q2, Q4, Q5, QH + SYMCRYPT_SQUARE_DIAG 10, Q1, Q3, Q0, Q2, Q4, Q5, QH + SYMCRYPT_SQUARE_DIAG 11, Q1, Q3, Q0, Q2, Q4, Q5, QH + SYMCRYPT_SQUARE_DIAG 12, Q1, Q3, Q0, Q2, Q4, Q5, QH + SYMCRYPT_SQUARE_DIAG 13, Q1, Q3, Q0, Q2, Q4, Q5, QH + SYMCRYPT_SQUARE_DIAG 14, Q1, Q3, Q0, Q2, Q4, Q5, QH + SYMCRYPT_SQUARE_DIAG 15, Q1, Q3, Q0, Q2, Q4, Q5, QH + +MUL_FUNCTION_END(SymCryptFdefRawSquareMulx1024) + +// VOID +// SYMCRYPT_CALL +// SymCryptFdefMontgomeryReduceMulx1024( +// _In_ PCSYMCRYPT_MODULUS pmMod, +// _Inout_ PUINT32 pSrc, +// _Out_ PUINT32 pDst ) +MUL_FUNCTION_START(SymCryptFdefMontgomeryReduceMulx1024, 3, 14) + + mov [rsp + GET_MEMSLOT_OFFSET(slot0)], Q3 + + mov D0, 2 + mov [rsp + GET_MEMSLOT_OFFSET(slot1)], D0 + // CntOuter = nDigits - using first half of slot3 + + xor D4, D4 + lea Q1, [Q1 + SymCryptModulusValueOffsetAmd64] // modulus value + +SymCryptFdefMontgomeryReduceMulx1024OuterLoop: + // Q1 = pmMod + // Q2 = pSrc = tmp buffer that we will reduce + mov Q6, [Q2 + 0 * 8] + mov Q7, [Q2 + 1 * 8] + mov Q8, [Q2 + 2 * 8] + mov Q9, [Q2 + 3 * 8] + mov Q10, [Q2 + 4 * 8] + mov Q11, [Q2 + 5 * 8] + mov Q12, [Q2 + 6 * 8] + mov Q13, [Q2 + 7 * 8] + + mov Q3, [Q1 - SymCryptModulusValueOffsetAmd64 + SymCryptModulusMontgomeryInv64OffsetAmd64] // inv64 + + // Q2 = value to reduce + // Q6 - Q13 = Q2[0..7] + // Q1 = modulus value + // Q3 = modinv + + MONTGOMERY18 Q6, Q7, Q8, Q9, Q10, Q11, Q12, Q13, Q3, Q1, Q2 + (0 * 8), Q0, Q5, QH + MONTGOMERY18 
Q7, Q8, Q9, Q10, Q11, Q12, Q13, Q6, Q3, Q1, Q2 + (1 * 8), Q0, Q5, QH + MONTGOMERY18 Q8, Q9, Q10, Q11, Q12, Q13, Q6, Q7, Q3, Q1, Q2 + (2 * 8), Q0, Q5, QH + MONTGOMERY18 Q9, Q10, Q11, Q12, Q13, Q6, Q7, Q8, Q3, Q1, Q2 + (3 * 8), Q0, Q5, QH + MONTGOMERY18 Q10, Q11, Q12, Q13, Q6, Q7, Q8, Q9, Q3, Q1, Q2 + (4 * 8), Q0, Q5, QH + MONTGOMERY18 Q11, Q12, Q13, Q6, Q7, Q8, Q9, Q10, Q3, Q1, Q2 + (5 * 8), Q0, Q5, QH + MONTGOMERY18 Q12, Q13, Q6, Q7, Q8, Q9, Q10, Q11, Q3, Q1, Q2 + (6 * 8), Q0, Q5, QH + MONTGOMERY18 Q13, Q6, Q7, Q8, Q9, Q10, Q11, Q12, Q3, Q1, Q2 + (7 * 8), Q0, Q5, QH + + // Q6 - Q13 = carry from multiply-add + // Q2[0..7] = Montgomery factors + + mov Q3, Q2 // factor to multiply by + add Q1, 64 + add Q2, 64 + + // Q6, Q7, Q8, Q9, Q10, Q11, Q12, Q13 8-word carry + // Q0, Q5 temps for multiplication + // Q1 running pointer pMod inner loop + // Q2 running pointer pSrc inner loop + // Q3 Montgomery factors for this row + // D4 loop ctr + // QH fixed input reg for multiplication + + MULADD88 Q6, Q7, Q8, Q9, Q10, Q11, Q12, Q13, Q2, Q1, Q3, Q0, Q5, QH + // pre & post: Cy = Ov = 0 + // Q13..Q6:Q2[7-0] = R[7-0]:D[7-0] = A[7:0] * B[7:0] + R[7:0] + D[7:0] + // QH is volatile + + add Q1, 64 + add Q2, 64 + + // We have an 8-word carry here, which we need to add to the in-memory buffer and retain a carry + // We also saved a 1-bit carry from the previous outer loop in D4 + // move carry into Cy flag + neg D4 + mov D4, 0 + + // We do this in separate instructions to help the instruction decoder build up a lead... + mov Q0, [Q2 + 0 * 8] + adc Q0, Q6 + mov [Q2 + 0 * 8], Q0 + + mov Q5, [Q2 + 1 * 8] + adc Q5, Q7 + mov [Q2 + 1 * 8], Q5 + + mov Q0, [Q2 + 2 * 8] + adc Q0, Q8 + mov [Q2 + 2 * 8], Q0 + + mov Q5, [Q2 + 3 * 8] + adc Q5, Q9 + mov [Q2 + 3 * 8], Q5 + + mov Q0, [Q2 + 4 * 8] + adc Q0, Q10 + mov [Q2 + 4 * 8], Q0 + + mov Q5, [Q2 + 5 * 8] + adc Q5, Q11 + mov [Q2 + 5 * 8], Q5 + + mov Q0, [Q2 + 6 * 8] + adc Q0, Q12 + mov [Q2 + 6 * 8], Q0 + + mov Q5, [Q2 + 7 * 8] + adc Q5, Q13 + mov [Q2 + 7 * 8], Q5 + + adc D4, D4 // D4 = carry (D4 was previously zero) + + sub Q2, 64 // Q2 = tmp buffer that we will reduce (64B are now zeroed) + sub Q1, 128 // Q1 = modulus value + + mov D0, [rsp + GET_MEMSLOT_OFFSET(slot1)] + sub D0, 1 + mov [rsp + GET_MEMSLOT_OFFSET(slot1)], D0 + + jnz SymCryptFdefMontgomeryReduceMulx1024OuterLoop + + // D4 = output carry + + mov Q3, [rsp + GET_MEMSLOT_OFFSET(slot0)] + + // Q2 = result buffer pointer + // Q1 = modulus value + // Q3 = Dst + + // pDst = Reduction result - Modulus + + mov Q0,[Q2 + 0 * 8] + sbb Q0,[Q1 + 0 * 8] + mov [Q3 + 0 * 8], Q0 + + mov Q5,[Q2 + 1 * 8] + sbb Q5,[Q1 + 1 * 8] + mov [Q3 + 1 * 8], Q5 + + mov Q0,[Q2 + 2 * 8] + sbb Q0,[Q1 + 2 * 8] + mov [Q3 + 2 * 8], Q0 + + mov Q5,[Q2 + 3 * 8] + sbb Q5,[Q1 + 3 * 8] + mov [Q3 + 3 * 8], Q5 + + mov Q0,[Q2 + 4 * 8] + sbb Q0,[Q1 + 4 * 8] + mov [Q3 + 4 * 8], Q0 + + mov Q5,[Q2 + 5 * 8] + sbb Q5,[Q1 + 5 * 8] + mov [Q3 + 5 * 8], Q5 + + mov Q0,[Q2 + 6 * 8] + sbb Q0,[Q1 + 6 * 8] + mov [Q3 + 6 * 8], Q0 + + mov Q5,[Q2 + 7 * 8] + sbb Q5,[Q1 + 7 * 8] + mov [Q3 + 7 * 8], Q5 + + mov Q0,[Q2 + 8 * 8] + sbb Q0,[Q1 + 8 * 8] + mov [Q3 + 8 * 8], Q0 + + mov Q5,[Q2 + 9 * 8] + sbb Q5,[Q1 + 9 * 8] + mov [Q3 + 9 * 8], Q5 + + mov Q0,[Q2 + 10 * 8] + sbb Q0,[Q1 + 10 * 8] + mov [Q3 + 10 * 8], Q0 + + mov Q5,[Q2 + 11 * 8] + sbb Q5,[Q1 + 11 * 8] + mov [Q3 + 11 * 8], Q5 + + mov Q0,[Q2 + 12 * 8] + sbb Q0,[Q1 + 12 * 8] + mov [Q3 + 12 * 8], Q0 + + mov Q5,[Q2 + 13 * 8] + sbb Q5,[Q1 + 13 * 8] + mov [Q3 + 13 * 8], Q5 + + mov Q0,[Q2 + 14 * 8] + sbb Q0,[Q1 + 14 * 8] + mov [Q3 
+ 14 * 8], Q0
+
+ mov Q5,[Q2 + 15 * 8]
+ sbb Q5,[Q1 + 15 * 8]
+ mov [Q3 + 15 * 8], Q5
+
+ // now a masked copy from the reduction buffer to the destination.
+ // copy if high carry = 0 and Cy = 1
+ sbb D4, 0
+ // D4 = copy mask, ff...ff if copy, 0 if no copy
+
+ movd xmm0, D4 // xmm0[0] = mask
+ pcmpeqd xmm1, xmm1 // xmm1 = ff...ff
+ pshufd xmm0, xmm0, 0 // xmm0[0..3] = mask
+ pxor xmm1, xmm0 // xmm1 = not Mask
+
+
+ movdqa xmm2, [Q2 + 0 * 16] // xmm2 = pSrc[0]
+ movdqa xmm3, [Q3 + 0 * 16] // xmm3 = pDst[0]
+ pand xmm2, xmm0
+ pand xmm3, xmm1
+ por xmm2, xmm3
+ movdqa [Q3 + 0 * 16], xmm2
+
+ movdqa xmm2, [Q2 + 1 * 16] // xmm2 = pSrc[1]
+ movdqa xmm3, [Q3 + 1 * 16] // xmm3 = pDst[1]
+ pand xmm2, xmm0
+ pand xmm3, xmm1
+ por xmm2, xmm3
+ movdqa [Q3 + 1 * 16], xmm2
+
+ movdqa xmm2, [Q2 + 2 * 16] // xmm2 = pSrc[2]
+ movdqa xmm3, [Q3 + 2 * 16] // xmm3 = pDst[2]
+ pand xmm2, xmm0
+ pand xmm3, xmm1
+ por xmm2, xmm3
+ movdqa [Q3 + 2 * 16], xmm2
+
+ movdqa xmm2, [Q2 + 3 * 16] // xmm2 = pSrc[3]
+ movdqa xmm3, [Q3 + 3 * 16] // xmm3 = pDst[3]
+ pand xmm2, xmm0
+ pand xmm3, xmm1
+ por xmm2, xmm3
+ movdqa [Q3 + 3 * 16], xmm2
+
+ movdqa xmm2, [Q2 + 4 * 16] // xmm2 = pSrc[4]
+ movdqa xmm3, [Q3 + 4 * 16] // xmm3 = pDst[4]
+ pand xmm2, xmm0
+ pand xmm3, xmm1
+ por xmm2, xmm3
+ movdqa [Q3 + 4 * 16], xmm2
+
+ movdqa xmm2, [Q2 + 5 * 16] // xmm2 = pSrc[5]
+ movdqa xmm3, [Q3 + 5 * 16] // xmm3 = pDst[5]
+ pand xmm2, xmm0
+ pand xmm3, xmm1
+ por xmm2, xmm3
+ movdqa [Q3 + 5 * 16], xmm2
+
+ movdqa xmm2, [Q2 + 6 * 16] // xmm2 = pSrc[6]
+ movdqa xmm3, [Q3 + 6 * 16] // xmm3 = pDst[6]
+ pand xmm2, xmm0
+ pand xmm3, xmm1
+ por xmm2, xmm3
+ movdqa [Q3 + 6 * 16], xmm2
+
+ movdqa xmm2, [Q2 + 7 * 16] // xmm2 = pSrc[7]
+ movdqa xmm3, [Q3 + 7 * 16] // xmm3 = pDst[7]
+ pand xmm2, xmm0
+ pand xmm3, xmm1
+ por xmm2, xmm3
+ movdqa [Q3 + 7 * 16], xmm2
+
+MUL_FUNCTION_END(SymCryptFdefMontgomeryReduceMulx1024)
+
+FILE_END()
diff --git a/lib/amd64/sha1asm.asm b/lib/amd64/sha1asm.asm
deleted file mode 100644
index 526c3c1..0000000
--- a/lib/amd64/sha1asm.asm
+++ /dev/null
@@ -1,423 +0,0 @@
-;
-; Sha1Asm.Asm
-;
-; Copyright (c) Microsoft Corporation. Licensed under the MIT license.
-;
-;
-
-;
-; This module implements the bulk processing of the FIPS 180-1 SHA message digest algorithm.
-; for the x64 processor architecture.
-;
-; This implementation is derived from the 32-bit one, which in turn is derived
-; from an older one by Scott Field and Dan Shumow.
-;
-
-include ksamd64.inc
-
- TITLE sha1asm.asm
-
- ;
- ; The four round constants used by SHA-1
- ;
-
-K0_19 EQU 05a827999H
-K20_39 EQU 06ed9eba1H
-K40_59 EQU 08f1bbcdcH
-K60_79 EQU 0ca62c1d6H
-
-
-;VOID
-;SYMCRYPT_CALL
-;SymCryptSha1AppendBlocks( _Inout_updates_( 5 ) PUINT32 H,
-; _In_reads_bytes_( cbData ) PCBYTE pbData,
-; SIZE_T cbData )
-;
-
- ;
- ; This function allocates stack space, so it is not a LEAF function
- ; but a nested one.
- ;
- NESTED_ENTRY SymCryptSha1AppendBlocksAsm, _TEXT
-
-;
-; To keep stack manipulations simple we define a structure and use that for all accesses.
-;
-
-SymCryptSha1AppendBlocksFrame struct 16, NONUNIQUE
-;
-; To keep the RSP aligned we need (8 mod 16) bytes of local stack space.
-; this is the case, so there is no need for a dummy location
-;
-Wbuf dd 16 dup (?)
-EndAddress dq ?
-SaveR12 dq ?
-SaveR13 dq ?
-SaveR14 dq ?
-SaveR15 dq ?
-SaveRdi dq ?
-SaveRsi dq ?
-SaveRbp dq ?
-SaveRbx dq ?
-ReturnAddress dq ?
-CallerP1Home dq ?
-CallerP2Home dq ?
-CallerP3Home dq ?
-CallerP4Home dq ?
- -SymCryptSha1AppendBlocksFrame ends - - ; - ; We use the W buffer extensively; this is a shorthand for the base address - ; -W equ rsp+SymCryptSha1AppendBlocksFrame.Wbuf - - - - ; - ; Set up our stack frame and save non-volatile registers - ; - rex_push_reg rbx - push_reg rbp - push_reg rsi - push_reg rdi - push_reg r15 - push_reg r14 - push_reg r13 - push_reg r12 - alloc_stack SymCryptSha1AppendBlocksFrame.SaveR12 - - END_PROLOGUE - - ; - ;Register allocation: - ; - ;5 registers for state - ;2 scratch - ;6 registers for W[t-1], W[t-2], W[t-3], W[t-14], W[t-15], W[t-16] - ;1 for data pointer - ;1 for H pointer - ; - ; - ; To allow macro re-ordering of our registers we use symbolic names - ; for the registers. - ; s0-s4 are the 5 state registers. x1 and x2 are extra scratch registers. - ; w0-w5 contain the W state cache - ; - ; Note: some other code puts the right value in the right register and - ; has to be updated if this mapping is changed. - ; - ; a is in register (round % 5) - ; b is in register (round+4 % 5) - ; c is in register (round+3 % 5) - ; d is in register (round+2 % 5) - ; e is in register (round+1 % 5) - ; This way, if round is incremented we move a->b, b->c, c->d, d->e, and e->a - ; For optimization the actual value of a is in scratch register x1 at the start of each round - ; - ; W[t- 1] is in register (round % 6) - ; W[t- 2] is in register (round+5 % 6) - ; W[t- 3] is in register (round+4 % 6) (is loaded with W[t-13] in each round) - ; W[t-14] is in register (round+3 % 6) - ; W[t-15] is in register (round+2 % 6) - ; W[t-16] is in register (round+1 % 6) - ; If round is incremented the values all appear in their right place. - -s0 EQU eax -s1 EQU ebx -s2 EQU ecx -s3 EQU edx -s4 EQU esi - -w0 EQU r9d -w1 EQU r10d -w2 EQU r11d -w3 EQU r12d -w4 EQU r13d -w5 EQU r14d - -x1 EQU ebp ; screatch 1 -x2 EQU edi ; scratch 2 - -dataPtr EQU r8 ; Points to data buffer -HPtr EQU r15 ; Points to H - - - ; At this point: - ; rcx = H - ; rdx = pbData - ; r8 = cbData - ; - ; compute the end address, address of byte after last block we will process - ; This code ensures that we never exceed the data buffer we were given, - ; although we silently round the cbData parameter down to the next - ; multiple of 64. - ; Do nothing if no blocks need to be processed. - ; - and r8,NOT 3fh ; round down to multiple of 64 - jz SymCryptSha1AppendBlocksDone - add r8,rdx ; pbData + (cbData & 0x3f) - mov [rsp+SymCryptSha1AppendBlocksFrame.EndAddress], r8 - - mov dataPtr,rdx - mov Hptr,rcx - - ; - ; Load the H state, note that the a value lives in x1 at the round code boundary - ; - mov x1,[Hptr ] - mov s4,[Hptr+ 4] - mov s3,[Hptr+ 8] - mov s2,[Hptr+12] - mov s1,[Hptr+16] - - -SymCryptSha1AppendBlocksLoop: - ; - ; This is the main loop. We process 64 bytes in each iteration. - ; - ; Most of the code in the loop is generated through macros using parameters to - ; rename the registers. - ; - -ROUND_CH_0_15 MACRO round,sa,sb,sc,sd,se,wt,x1,x2 - ; - ; Code for round 0-15. - ; This code loads data from the data buffer & BSWAPs the data to get it into the - ; right form. - ; - ; Parameters: - ; round round number - ; sa register that will contain the a value - ; sb register that contains the b value - ; sc register that contains the c value - ; sd register that contains the d value - ; se register that contains the e value - ; x1 scratch, contains the a value on entry - ; x2 scratch register. - ; wt register loaded with Wt - ; - ; We use the formula CH(b,c,d) = ((d ^ c) & b) ^ c which uses only one temp register. 
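In C, the selection function and the one-temp rewrite look like this (a minimal sketch, not SymCrypt source; note that the final xor is with d, as the round macros below spell out):

```c
#include <stdint.h>

// Ch(b,c,d) = (b & c) ^ (~b & d), rewritten as ((d ^ c) & b) ^ d so that
// the whole evaluation needs only a single scratch register.
static uint32_t sha1_ch( uint32_t b, uint32_t c, uint32_t d )
{
    uint32_t t = d ^ c;     // xor
    t &= b;                 // and
    return t ^ d;           // xor
}
```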
- ; We start with the d value as that is the oldest value and available the first - ; - ; See FIPS 180-2 for our symbolic notation. - ; - mov x2,sd ; x2 = d - mov wt,[dataPtr+4*round] ; Fetch word from message - mov sa, x1 ; put a in the correct register - - bswap wt ; wt = Wt - xor x2,sc ; x2 = (d ^ c) - rol x1,5 ; x1 = ROL(a,5) - - add se,wt ; se = e + Wt - and x2,sb ; x2 = ((d ^ c) & b) - mov [W + 4*round],wt ; Store in W buffer for future use - ror sb,2 ; sb = ROL( b, 30 ) - - add se,x1 ; se = e + Wt + ROL(a,5) - xor x2,sd ; x2 = ((d ^ c) & b) ^ d = CH(b,c,d) - - lea x1,[se+x2+K0_19] ; x1 = e + Wt + ROL(a,5) + Ch(b,c,d) + Kt - - ENDM - -MSG_EXP MACRO round, se, wa, wb, wc - ; round round number - ; se register of state to add expanded message word to - ; wa register of W[round-16], will be updated to contain W[round] - ; wb register of W[round-14] - ; wc register of W[round- 3], will be loaded with W[round-13] - - xor wc, wb ; wc = W[t-3] ^ W[t-14] - xor wa,[W+4*((round-8) MOD 16)] ; wa = W[t-16] ^ W[t-8] - xor wa, wc ; wa = W[t-16] ^ W[t-14] ^ W[t-8] ^ W[t-3] - rol wa,1 ; wa = Wt - IF round LT (80 - 1) - ; do not load wc with W[t-13] in the last round; it will not be needed - mov wc,[W+4*((round-13) MOD 16)] ; wc = W[t-13] - ENDIF - add se,wa ; re = e + Wt - IF round LT (80 - 8) - ; don't store Wt in the last 8 rounds. The value would never be used - mov [W+4*(round MOD 16)], wa; Store Wt - ENDIF - ENDM - -ROUND_CH MACRO round, sa, sb, sc, sd, se, wa, wb, wc, x1, x2 - ; - ; See ROUND_CH_0_15 for most parameters. - ; x1 and x2 are both scratch registers - ; wa register of W[round-16], will be updated to contain W[round] - ; wb register of W[round-14] - ; wc register of W[round- 3], will be loaded with W[round-13] - ; - - xor wc, wb ; wc = W[t-3] ^ W[t-14] - xor wa,[W+4*((round-8) MOD 16)] ; wa = W[t-16] ^ W[t-8] - xor wa, wc ; wa = W[t-16] ^ W[t-14] ^ W[t-8] ^ W[t-3] - rol wa,1 ; wa = Wt - mov wc,[W+4*((round-13) MOD 16)] ; wc = W[t-13] - add se,wa ; re = e + Wt - mov [W+4*(round MOD 16)], wa ; Store Wt - - mov sa, x1 ; put a in the correct register - mov x2,sd ; x2 = d - rol x1,5 ; x1 = ROL(a,5) - xor x2,sc ; x2 = (d ^ c) - add se,x1 ; re = e + Wt + ROL(a,5) - and x2,sb ; x2 = ((d ^ c) & b) - ror sb,2 ; rb = ROL( b, 30 ) - xor x2,sd ; x2 = ((d ^ c) & b) ^ d = CH(b,c,d) - lea x1,[se+x2+K0_19] ; re = e + Wt + ROL(a,5) + Ch(b,c,d) + Kt - ENDM - -ROUND_PARITY MACRO round, sa, sb, sc, sd, se, wa, wb, wc, x1, x2, K - ; - ; See ROUND_CH for most parameters - ; K is the round constant to use. - ; - ; The order of xorring the registers b, c, and d is driven by the data dependency graph. 
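As a reference for what these macros compute, here is one expanded parity round in C (a minimal sketch, not SymCrypt source; W is the circular 16-word buffer kept on the stack, and the asm gets the a..e rotation for free by renaming registers each round instead of moving data):

```c
#include <stdint.h>

static uint32_t rol32( uint32_t x, int n ) { return (x << n) | (x >> (32 - n)); }

// One round for t in 20..39 or 60..79: MSG_EXP message expansion followed by
// e += ROL(a,5) + Parity(b,c,d) + Wt + K and b = ROL(b,30).
static void sha1_round_parity( uint32_t W[16], int t, uint32_t a, uint32_t *b,
                               uint32_t c, uint32_t d, uint32_t *e, uint32_t K )
{
    uint32_t wt = rol32( W[(t-3) & 15] ^ W[(t-8) & 15] ^
                         W[(t-14) & 15] ^ W[t & 15], 1 );   // W[t-16] sits at t mod 16
    W[t & 15] = wt;                                         // store Wt for later rounds
    *e += rol32( a, 5 ) + (*b ^ c ^ d) + wt + K;
    *b = rol32( *b, 30 );
}
```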
- ; We start with d (the oldest) and then do b to unblock the subsequent rotate - ; - MSG_EXP round, se, wa, wb, wc ; re = e + Wt - - mov sa,x1 ; store a value in right register - rol x1,5 ; x1 = ROL(a,5) - add se,x1 ; re = e + Wt + ROL(a,5) - - mov x2,sd ; x1 = d - xor x2,sb ; x1 = (d ^ b) - xor x2,sc ; x1 = (d ^ b ^ c) = Parity(b,c,d) - ror sb,2 ; rb = ROL( b, 30 ) - lea x1,[se+x2+K] ; re = e + ROL(a,5) + Parity(b,c,d) + Wt + Kt - - ENDM - -ROUND_MAJ MACRO round, sa, sb, sc, sd, se, wa, wb, wc, x1, x2 - ; - ; See above for parameter explanation - ; - MSG_EXP round, se, wa, wb, wc ; re = e + Wt - - mov sa,x1 ; store a value in right register - rol x1,5 ; x1 = ROL(a,5) - add se,x1 ; re = e + ROL(a,5) - mov x1,sd ; x1 = d - or x1,sc ; x1 = (d | c) - and x1,sb ; x1 = ((d | c) & b) - - mov x2,sc ; x2 = c - and x2,sd ; x2 = (c & d) - or x1,x2 ; x1 = ((d | c) & b) | (d & c) = MAJ(b,c,d) - - ror sb,2 ; rb = ROL( b, 30 ) - - lea x1,[se+x1+K40_59] ; re = e + ROL(a,5) + Wt + Maj(b,c,d) + Kt - ENDM - - - ; - ; With these macros we can now produce the actual code. - ; Note the use of the % operator which evaluates the expression and yields the result as text. - ; Together with the macros and the r EQUs this provides us with automatic register renaming - ; for each round. - ; - ; The first 16 rounds are more complicated as we need to use the right registers to load the msg in - ; so we do those by hand - ; - ; W[t- 1] is in register (round % 6) - ; W[t- 2] is in register (round+5 % 6) - ; W[t- 3] is in register (round+4 % 6) (is loaded with W[t-13] in each round) - ; W[t-14] is in register (round+3 % 6) - ; W[t-15] is in register (round+2 % 6) - ; W[t-16] is in register (round+1 % 6) - ; - ROUND_CH_0_15 0, s0, s4, s3, s2, s1, w5, x1, x2 ;W[t-16] for t=16 is in w5 - ROUND_CH_0_15 1, s1, s0, s4, s3, s2, w0, x1, x2 ;W[t-15] for t=16 is in w0 - ROUND_CH_0_15 2, s2, s1, s0, s4, s3, w1, x1, x2 ;W[t-14] for t=16 is in w1 - ROUND_CH_0_15 3, s3, s2, s1, s0, s4, w3, x1, x2 ; - ROUND_CH_0_15 4, s4, s3, s2, s1, s0, w4, x1, x2 ; - ROUND_CH_0_15 5, s0, s4, s3, s2, s1, w3, x1, x2 ; - ROUND_CH_0_15 6, s1, s0, s4, s3, s2, w4, x1, x2 ; - ROUND_CH_0_15 7, s2, s1, s0, s4, s3, w3, x1, x2 ; - ROUND_CH_0_15 8, s3, s2, s1, s0, s4, w4, x1, x2 ; - ROUND_CH_0_15 9, s4, s3, s2, s1, s0, w3, x1, x2 ; - ROUND_CH_0_15 10, s0, s4, s3, s2, s1, w4, x1, x2 ; - ROUND_CH_0_15 11, s1, s0, s4, s3, s2, w3, x1, x2 ; - ROUND_CH_0_15 12, s2, s1, s0, s4, s3, w4, x1, x2 ; - ROUND_CH_0_15 13, s3, s2, s1, s0, s4, w2, x1, x2 ;W[t-3] for t=16 is in w2 - ROUND_CH_0_15 14, s4, s3, s2, s1, s0, w3, x1, x2 ;W[t-2] for t=16 is in w3 - ROUND_CH_0_15 15, s0, s4, s3, s2, s1, w4, x1, x2 ;W[t-1] for t=16 is in w4 - - - FOR t, <16, 17, 18, 19> - ROUND_CH t, s%(t MOD 5), s%((t+4) MOD 5), s%((t+3) MOD 5), s%((t+2) MOD 5), s%((t+1) MOD 5), w%((t+1) MOD 6), w%((t+3) MOD 6), w%((t+4) MOD 6), x1, x2 - ENDM - - FOR t, <20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39> - ROUND_PARITY t, s%(t MOD 5), s%((t+4) MOD 5), s%((t+3) MOD 5), s%((t+2) MOD 5), s%((t+1) MOD 5), w%((t+1) MOD 6), w%((t+3) MOD 6), w%((t+4) MOD 6), x1, x2, K20_39 - ENDM - - FOR t, <40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59> - ROUND_MAJ t, s%(t MOD 5), s%((t+4) MOD 5), s%((t+3) MOD 5), s%((t+2) MOD 5), s%((t+1) MOD 5), w%((t+1) MOD 6), w%((t+3) MOD 6), w%((t+4) MOD 6), x1, x2 - ENDM - - FOR t, <60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79> - ROUND_PARITY t, s%(t MOD 5), s%((t+4) MOD 5), s%((t+3) MOD 5), 
s%((t+2) MOD 5), s%((t+1) MOD 5), w%((t+1) MOD 6), w%((t+3) MOD 6), w%((t+4) MOD 6), x1, x2, K60_79 - ENDM - - ; - ; Now we update the state, & the dataPtr - ; - add x1,[Hptr ] - add s4,[Hptr+ 4] - add dataPtr,64 - add s3,[Hptr+ 8] - add s2,[Hptr+12] - add s1,[Hptr+16] - - mov [Hptr ], x1 - mov [Hptr+ 4], s4 - cmp dataPtr,[rsp+SymCryptSha1AppendBlocksFrame.EndAddress] ; Loop terminating condition - mov [Hptr+ 8], s3 - mov [Hptr+12], s2 - mov [Hptr+16], s1 - - jc SymCryptSha1AppendBlocksLoop ; Main loop - - ; - ; We're done processing the blocks. The result is already in the state, so all we have to do - ; is clean up. - ; - ; Wipe the W buffer - ; The @@: label is an anonymous label. You can refer to the previous one using @B, which is easy to read. - ; - mov rcx,64 - xor rax,rax -@@: sub ecx,16 - mov [rsp+rcx ],rax - mov [rsp+rcx+8],rax - jnz @B - -SymCryptSha1AppendBlocksDone: - - - add rsp, SymCryptSha1AppendBlocksFrame.SaveR12 - - BEGIN_EPILOGUE - pop r12 - pop r13 - pop r14 - pop r15 - pop rdi - pop rsi - pop rbp - pop rbx - - ret - - NESTED_END SymCryptSha1AppendBlocksAsm, _TEXT - -END - diff --git a/lib/amd64/symcrypt_magic.inc b/lib/amd64/symcrypt_magic.inc deleted file mode 100644 index 7dbdaaa..0000000 --- a/lib/amd64/symcrypt_magic.inc +++ /dev/null @@ -1,37 +0,0 @@ -; -; SymCrypt_magic.inc -; Copyright (c) Microsoft Corporation. Licensed under the MIT license. -; -; Include file to define the support macros for the Magic field -; - - extern SymCryptFatal:NEAR - - -SYMCRYPT_MAGIC_FIELD MACRO - - if DBG - magic dq ? - endif - - ENDM - -SYMCRYPT_CODE_VERSION EQU ((SYMCRYPT_CODE_VERSION_API SHL 16) OR SYMCRYPT_CODE_VERSION_MINOR ) -SYMCRYPT_MAGIC_CONSTANT EQU ('S1mv' + SYMCRYPT_CODE_VERSION) - -SYMCRYPT_CHECK_MAGIC MACRO ptr, struct_name - - if DBG - - mov rax, [ptr + struct_name.magic] - sub rax, ptr - cmp rax, SYMCRYPT_MAGIC_CONSTANT - jz @F - mov ecx, 'magc' - call SymCryptFatal -@@: - endif - - ENDM - - diff --git a/lib/amd64/wipe.asm b/lib/amd64/wipe.asm deleted file mode 100644 index 19175f0..0000000 --- a/lib/amd64/wipe.asm +++ /dev/null @@ -1,171 +0,0 @@ -; -; Wipe.asm -; -; Copyright (c) Microsoft Corporation. Licensed under the MIT license. -; - -include ksamd64.inc - - TITLE wipe.asm - -;VOID -;SYMCRYPT_CALL -;SymCryptWipe( _Out_writes_bytes_( cbData ) PVOID pbData, -; SIZE_T cbData ) - - ; - ; This function allocates no stack space, calls no functions, and does not save - ; any non-volatile registers. Thusm it is a LEAF function - ; - LEAF_ENTRY SymCryptWipeAsm, _TEXT - - ; rcx = pbData - ; rdx = cbData - - ; - ; This function will handle any alignment of pbData and any size, but it is optimized for - ; the case where the start and end of the buffer are 16-aligned. - ; 16 is the natural stack alignment on AMD64, and structures can be designed to be a multiple - ; of 16 long without adding too much slack. - ; The cost of non-alignment is relatively low, in the order of 5 cycles or so - ; - - xorps xmm0,xmm0 ; Zero register for 16-byte wipes - cmp rdx,16 - jb SymCryptWipeAsmSmall ; if cbData < 16, this is a rare case - - test rcx,15 - jnz SymCryptWipeAsmUnAligned; if data pointer is unaligned, we jump to the code that aligns the pointer - ; For well-optimized callers the aligned case is the common one, and that is - ; the fall-through. 
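As a rough C model of this strategy (illustrative, not SymCrypt source; a real wipe must also keep the compiler from eliminating the stores, a problem the asm version avoids by construction, and the asm wipes 32 bytes per iteration with a branch-free adjustment for an odd 16-byte block):

```c
#include <stdint.h>
#include <string.h>

static void wipe_model( unsigned char *p, size_t cb )
{
    unsigned char *end = p + cb;

    if( cb < 16 )
    {
        while( p < end ) *p++ = 0;          // rare small case, handled separately
        return;
    }
    memset( p, 0, 16 );                     // 16-byte head at any alignment
    p += 16 - ((uintptr_t) p & 15);         // move to a 16-aligned position
    while( end - p >= 16 )
    {
        memset( p, 0, 16 );                 // aligned 16-byte wipes
        p += 16;
    }
    memset( end - 16, 0, 16 );              // overlapping 16-byte tail
}
```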
- -SymCryptWipeAsmAligned: - ; - ; Here rcx is aligned, and rdx contains the # bytes left to wipe, and rdx >= 16 - ; - ; Our loop wipes in 32-byte increments; we always wipe the first 16 bytes if - ; and increment the pbData pointer if cbData is 16 mod 32 - ; This avoids a conditional jump and is faster. - ; - test rdx,16 - movaps [rcx],xmm0 ; it is safe to always wipe as cbData >= 16 - lea r8,[rcx+16] - cmovnz rcx,r8 ; only increment pbData if cbData = 16 mod 32 - - sub rdx,32 ; see if we have >= 32 bytes to wipe - jc SymCryptWipeAsmTailOptional ; if not, wipe tail, or nothing if cbData = 0 mod 16 - - align 16 - -SymCryptWipeAsmLoop: - movaps [rcx],xmm0 - movaps [rcx+16],xmm0 ; Wipe 32 bytes - add rcx,32 - sub rdx,32 - jnc SymCryptWipeAsmLoop - -SymCryptWipeAsmTailOptional: - ; only the lower 4 bits of rdx are valid, we have subtracted too much already. - ; The wipe was at least 16 bytes, so we can just wipe the tail in one instruction - - and edx,15 - jnz SymCryptWipeAsmTail - ret - -SymCryptWipeAsmTail: - ; This code appears also below at the end of the unaligned wiping routine - ; but making the jnz jump further is slower and we only duplicate 4 instructions. - xor eax,eax - mov [rcx+rdx-16],rax - mov [rcx+rdx-8],rax - ret - - align 4 -SymCryptWipeAsmUnaligned: - - ; - ; At this point we know that cbData(rdx) >= 16 and pbData(rcx) is unaligned. - ; We can wipe 16 bytes and move to an aligned position - ; - xor eax,eax - mov [rcx],rax - mov [rcx+8],rax - - mov eax,ecx ; - neg eax ; lower 4 bits of eax = # bytes to wipe to reach alignment - and eax,15 - add rcx,rax - sub rdx,rax - - ; - ; If rdx > 16, go to the aligned wiping loop - ; - cmp rdx,16 - jae SymCryptWipeAsmAligned ; if cbData >= 16, do aligned wipes - - ; - ; We have <= 16 bytes to wipe, and we know that the full wipe region was at least 16 bytes. - ; We just wipe the last 16 bytes completely. - ; - xor eax,eax - mov [rcx+rdx-16],rax - mov [rcx+rdx-8],rax - ret - - - align 8 -SymCryptWipeAsmSmall: - ; rcx = pbData, possibly unaligned - ; rdx = cbData; rdx < 16 - ; - ; With speculative execution attacks, the cost of a jump table is prohibitive. - ; We use a compare ladder for 5 cases: - ; 8-15 bytes - ; 4-7 bytes - ; 2-3 bytes - ; 1 byte - ; 0 bytes - - xor eax,eax - - cmp edx, 8 - jb SymCryptWipeAsmSmallLessThan8 - - ; wipe 8-15 bytes using two possibly overlapping writes - mov [rcx], rax - mov [rcx + rdx - 8], rax - ret - -SymCryptWipeAsmSmallLessThan8: - cmp edx, 4 - jb SymCryptWipeAsmSmallLessThan4 - - ; wipe 4-7 bytes - mov [rcx], eax - mov [rcx + rdx - 4], eax - ret - -SymCryptWipeAsmSmallLessThan4: - cmp edx, 2 - jb SymCryptWipeAsmSmallLessThan2 - - ; wipe 2-3 bytes - mov [rcx], ax - mov [rcx + rdx - 2], ax - ret - -SymCryptWipeAsmSmallLessThan2: - or edx, edx - jz SymCryptWipeAsmSmallDone - - ; wipe 1 byte - mov [rcx], al - -SymCryptWipeAsmSmallDone: - - ret - - LEAF_END SymCryptWipeAsm, _TEXT - -END - diff --git a/lib/amd64/wipe.symcryptasm b/lib/amd64/wipe.symcryptasm new file mode 100644 index 0000000..22225b0 --- /dev/null +++ b/lib/amd64/wipe.symcryptasm @@ -0,0 +1,165 @@ +// +// wipe.symcryptasm Assembler code for wiping a buffer +// Expresses asm in a generic enough way to enable generation of MASM and GAS using the +// symcryptasm_processor.py script and C preprocessor +// +// Copyright (c) Microsoft Corporation. Licensed under the MIT license. 
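+//
+// For example, with the register mappings in scripts/symcryptasm_processor.py, the abstract
+// argument registers of FUNCTION_START(SymCryptWipeAsm, 2, 4) below resolve to Q1 = rcx and
+// Q2 = rdx under the MSFT x64 calling convention, and to Q1 = rdi and Q2 = rsi under the
+// SystemV calling convention, so a single source serves both environments.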
+ + +#include "symcryptasm_shared.cppasm" + +//VOID +//SYMCRYPT_CALL +//SymCryptWipe( _Out_writes_bytes_( cbData ) PVOID pbData, +// SIZE_T cbData ) + +FUNCTION_START(SymCryptWipeAsm, 2, 4) + + // Q1 = pbData + // Q2 = cbData + + // + // This function will handle any alignment of pbData and any size, but it is optimized for + // the case where the start and end of the buffer are 16-aligned. + // 16 is the natural stack alignment on AMD64, and structures can be designed to be a multiple + // of 16 long without adding too much slack. + // The cost of non-alignment is relatively low, in the order of 5 cycles or so + // + + xorps xmm0,xmm0 // Zero register for 16-byte wipes + cmp Q2,16 + jb SymCryptWipeAsmSmall // if cbData < 16, this is a rare case + + test Q1,15 + jnz SymCryptWipeAsmUnaligned // if data pointer is unaligned, we jump to the code that aligns the pointer + // For well-optimized callers the aligned case is the common one, and that is + // the fall-through. + +SymCryptWipeAsmAligned: + // + // Here Q1 is aligned, and Q2 contains the # bytes left to wipe, and Q2 >= 16 + // + // Our loop wipes in 32-byte increments; we always wipe the first 16 bytes + // and increment the pbData pointer if cbData is 16 mod 32 + // This avoids a conditional jump and is faster. + // + test Q2,16 + movaps [Q1],xmm0 // it is safe to always wipe as cbData >= 16 + lea Q3,[Q1+16] + cmovnz Q1,Q3 // only increment pbData if cbData = 16 mod 32 + + sub Q2,32 // see if we have >= 32 bytes to wipe + jc SymCryptWipeAsmTailOptional // if not, wipe tail, or nothing if cbData = 0 mod 16 + +ALIGN(16) + +SymCryptWipeAsmLoop: + movaps [Q1],xmm0 + movaps [Q1+16],xmm0 // Wipe 32 bytes + add Q1,32 + sub Q2,32 + jnc SymCryptWipeAsmLoop + +SymCryptWipeAsmTailOptional: + // only the lower 4 bits of Q2 are valid, we have subtracted too much already. + // The wipe was at least 16 bytes, so we can just wipe the tail with 2 instructions + + and D2,15 + jnz SymCryptWipeAsmTail + ret + +SymCryptWipeAsmTail: + // This code appears also below at the end of the unaligned wiping routine + // but making the jnz jump further is slower and we only duplicate 4 instructions. + xor D0,D0 + mov [Q1+Q2-16],Q0 + mov [Q1+Q2-8],Q0 + ret + +ALIGN(4) + +SymCryptWipeAsmUnaligned: + + // + // At this point we know that cbData(Q2) >= 16 and pbData(Q1) is unaligned. + // We can wipe 16 bytes and move to an aligned position + // + xor D0,D0 + mov [Q1],Q0 + mov [Q1+8],Q0 + + mov D0,D1 + neg D0 // lower 4 bits of D0 = # bytes to wipe to reach alignment + and D0,15 + add Q1,Q0 + sub Q2,Q0 + + // + // If Q2 > 16, go to the aligned wiping loop + // + cmp Q2,16 + jae SymCryptWipeAsmAligned // if cbData >= 16, do aligned wipes + + // + // We have <= 16 bytes to wipe, and we know that the full wipe region was at least 16 bytes. + // We just wipe the last 16 bytes completely. + // + xor D0,D0 + mov [Q1+Q2-16],Q0 + mov [Q1+Q2-8],Q0 + ret + +ALIGN(8) + +SymCryptWipeAsmSmall: + // Q1 = pbData, possibly unaligned + // Q2 = cbData; Q2 < 16 + // + // With speculative execution attacks, the cost of a jump table is prohibitive. 
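+    // (A jump table is reached through an indirect branch; with Spectre-style attacks, indirect
+    // branches have to be mitigated, which makes them expensive. A short compare ladder avoids
+    // the issue entirely.)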
+ // We use a compare ladder for 5 cases: + // 8-15 bytes + // 4-7 bytes + // 2-3 bytes + // 1 byte + // 0 bytes + + xor D0,D0 + + cmp D2, 8 + jb SymCryptWipeAsmSmallLessThan8 + + // wipe 8-15 bytes using two possibly overlapping writes + mov [Q1],Q0 + mov [Q1+Q2-8],Q0 + ret + +SymCryptWipeAsmSmallLessThan8: + cmp D2, 4 + jb SymCryptWipeAsmSmallLessThan4 + + // wipe 4-7 bytes + mov [Q1],D0 + mov [Q1+Q2-4],D0 + ret + +SymCryptWipeAsmSmallLessThan4: + cmp D2, 2 + jb SymCryptWipeAsmSmallLessThan2 + + // wipe 2-3 bytes + mov [Q1],W0 + mov [Q1+Q2-2],W0 + ret + +SymCryptWipeAsmSmallLessThan2: + or D2,D2 + jz SymCryptWipeAsmSmallDone + + // wipe 1 byte + mov [Q1],B0 + +SymCryptWipeAsmSmallDone: + +FUNCTION_END(SymCryptWipeAsm) + +FILE_END() diff --git a/lib/arm/fdef_asm.asm b/lib/arm/fdef_asm.asm index 0d6e459..e514928 100644 --- a/lib/arm/fdef_asm.asm +++ b/lib/arm/fdef_asm.asm @@ -9,7 +9,11 @@ #include "symcrypt_version.inc" #include "symcrypt_magic.inc" +; As Arm assembler already uses C preprocessor, we can just hardcode this asm to include constants +; MASM for now. To be fixed properly when converting arm64 asm to symcryptasm. +#define SYMCRYPT_MASM #include "C_asm_shared.inc" +#undef SYMCRYPT_MASM ; A digit consists of 4 words of 32 bits each @@ -449,11 +453,11 @@ SymCryptFdefRawSquareAsmInnerLoopInit_Word1 SQR_SINGLEADD_32 3 - + add r2, r2, #16 add r4, r4, #16 - adds r3, r3, #1 ; move one digit up + adds r3, r3, #1 ; move one digit up bne SymCryptFdefRawSquareAsmInnerLoopInit_Word0 str r11, [r4] ; Store the next word into the destination @@ -689,7 +693,7 @@ SymCryptFdefMontgomeryReduceAsmInner adds r11, r11, r7 ; c + pSrc[nWords] + hc adc r8, r8, #0 ; Add the carry if any str r11, [r1], #4 ; pSrc[nWords] = c - + adds r12, r12, r6 ; c + pSrc[nWords+1] adc r9, r9, #0 ; Add the carry if any adds r12, r12, r8 ; c + pSrc[nWords] + hc @@ -701,7 +705,7 @@ SymCryptFdefMontgomeryReduceAsmInner add r2, r2, #8 ; Move stored pSrc pointer two words up ldr r0, [sp, #pMod] ; Restore the pMod pointer mov r1, r2 ; Restore the pSrc pointer - + bne SymCryptFdefMontgomeryReduceAsmOuter ; diff --git a/lib/arm64/fdef369_asm.asm b/lib/arm64/fdef369_asm.asm index bb75673..11d4985 100644 --- a/lib/arm64/fdef369_asm.asm +++ b/lib/arm64/fdef369_asm.asm @@ -16,7 +16,11 @@ #include "symcrypt_name_mangling.inc" #include "symcrypt_magic.inc" +; As Arm assembler already uses C preprocessor, we can just hardcode this asm to include constants +; MASM for now. To be fixed properly when converting arm64 asm to symcryptasm. +#define SYMCRYPT_MASM #include "C_asm_shared.inc" +#undef SYMCRYPT_MASM ; A digit consists of 3 words of 64 bits each @@ -213,7 +217,7 @@ SymCryptFdef369RawMulAsmLoopInner1 adcs x12, x12, x15 ; Adding the previous word (if there was a carry from the last addition it is added) umulh x15, x6, x8 ; Bits <127:64> of pSrc1[0]*pSrc2[j+2] str x12, [x4], #8 ; Store to destination - + cbnz x3, SymCryptFdef369RawMulAsmLoopInner1 adc x15, x15, XZR ; Store the next word into the destination (with the carry if any) diff --git a/lib/arm64/fdef_asm.asm b/lib/arm64/fdef_asm.asm index 5dcd9c4..8897ba9 100644 --- a/lib/arm64/fdef_asm.asm +++ b/lib/arm64/fdef_asm.asm @@ -10,7 +10,11 @@ #include "symcrypt_name_mangling.inc" #include "symcrypt_magic.inc" +; As Arm assembler already uses C preprocessor, we can just hardcode this asm to include constants +; MASM for now. To be fixed properly when converting arm64 asm to symcryptasm. 
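+; (The SYMCRYPT_MASM define below presumably selects the MASM flavour of the constant
+; definitions in C_asm_shared.inc, as opposed to its C or GAS flavours.)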
+#define SYMCRYPT_MASM #include "C_asm_shared.inc" +#undef SYMCRYPT_MASM ; A digit consists of 4 words of 64 bits each diff --git a/lib/fdef_general.c b/lib/fdef_general.c index 4c4383a..0a79476 100644 --- a/lib/fdef_general.c +++ b/lib/fdef_general.c @@ -517,11 +517,11 @@ SymCryptFdefIntSetValueUint64( SYMCRYPT_ERROR SYMCRYPT_CALL SymCryptFdefRawSetValue( - _In_reads_bytes_(cbSrc) PCBYTE pbSrc, - SIZE_T cbSrc, - SYMCRYPT_NUMBER_FORMAT format, - _Out_writes_(nWords) PUINT32 pDst, - UINT32 nDigits ) + _In_reads_bytes_(cbSrc) PCBYTE pbSrc, + SIZE_T cbSrc, + SYMCRYPT_NUMBER_FORMAT format, + _Out_writes_(nDigits * SYMCRYPT_FDEF_DIGIT_NUINT32) PUINT32 pDst, + UINT32 nDigits ) { SYMCRYPT_ERROR scError; UINT32 b; @@ -611,11 +611,11 @@ SymCryptFdefIntSetValue( SYMCRYPT_ERROR SYMCRYPT_CALL SymCryptFdefRawGetValue( - _In_reads_(nWords) PCUINT32 pSrc, - UINT32 nDigits, - _Out_writes_bytes_(cbBytes) PBYTE pbDst, - SIZE_T cbDst, - SYMCRYPT_NUMBER_FORMAT format ) + _In_reads_(nDigits * SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc, + UINT32 nDigits, + _Out_writes_bytes_(cbBytes) PBYTE pbDst, + SIZE_T cbDst, + SYMCRYPT_NUMBER_FORMAT format ) { SYMCRYPT_ERROR scError; UINT32 b; diff --git a/lib/fdef_int.c b/lib/fdef_int.c index dd5d636..7fbbb23 100644 --- a/lib/fdef_int.c +++ b/lib/fdef_int.c @@ -722,11 +722,11 @@ SymCryptFdefIntSquare( VOID SYMCRYPT_CALL SymCryptFdefRawMulC( - _In_reads_(nWords1) PCUINT32 pSrc1, - UINT32 nDigits1, - _In_reads_(nWords2) PCUINT32 pSrc2, - UINT32 nDigits2, - _Out_writes_(nWords1 + nWords2) PUINT32 pDst ) + _In_reads_(nDigits1 * SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc1, + UINT32 nDigits1, + _In_reads_(nDigits2 * SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc2, + UINT32 nDigits2, + _Out_writes_((nDigits1+nDigits2)*SYMCRYPT_FDEF_DIGIT_NUINT32) PUINT32 pDst ) { UINT32 nWords1 = nDigits1 * SYMCRYPT_FDEF_DIGIT_NUINT32; UINT32 nWords2 = nDigits2 * SYMCRYPT_FDEF_DIGIT_NUINT32; @@ -778,9 +778,9 @@ SymCryptFdefRawMul( VOID SYMCRYPT_CALL SymCryptFdefRawSquareC( - _In_reads_(nWords) PCUINT32 pSrc, - UINT32 nDigits, - _Out_writes_(2*nWords) PUINT32 pDst ) + _In_reads_(nDigits * SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc, + UINT32 nDigits, + _Out_writes_(2*nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PUINT32 pDst ) { UINT32 nWords = nDigits * SYMCRYPT_FDEF_DIGIT_NUINT32; diff --git a/lib/fdef_mod.c b/lib/fdef_mod.c index 3e3b7f0..07b6e43 100644 --- a/lib/fdef_mod.c +++ b/lib/fdef_mod.c @@ -1223,7 +1223,7 @@ SymCryptFdefModMulMontgomery( SymCryptFdefMontgomeryReduce( pmMod, pTmp, &peDst->d.uint32[0] ); } -#if SYMCRYPT_CPU_AMD64 && SYMCRYPT_MS_VC +#if SYMCRYPT_CPU_AMD64 VOID SYMCRYPT_CALL SymCryptFdefModMulMontgomeryMulx( @@ -1283,7 +1283,7 @@ SymCryptFdefModSquareMontgomery( } -#if SYMCRYPT_CPU_AMD64 && SYMCRYPT_MS_VC +#if SYMCRYPT_CPU_AMD64 VOID SYMCRYPT_CALL SymCryptFdefModSquareMontgomeryMulx( @@ -1356,70 +1356,12 @@ SymCryptFdefModInvMontgomery( return scError; } -#if SYMCRYPT_CPU_AMD64 && SYMCRYPT_MS_VC +#if SYMCRYPT_CPU_AMD64 //===================================== // 256-bit Montgomery modulus code // -VOID -SYMCRYPT_CALL -SymCryptFdefModAdd256Test( - _In_ PCSYMCRYPT_MODULUS pmMod, - _In_ PCSYMCRYPT_MODELEMENT peSrc1, - _In_ PCSYMCRYPT_MODELEMENT peSrc2, - _Out_ PSYMCRYPT_MODELEMENT peDst, - _Out_writes_bytes_( cbScratch ) PBYTE pbScratch, - SIZE_T cbScratch ) -{ - SYMCRYPT_ASYM_ALIGN BYTE buf1[128]; - SYMCRYPT_ASYM_ALIGN BYTE buf2[128]; - PSYMCRYPT_MODELEMENT peTmp1 = SymCryptModElementCreate( SYMCRYPT_ASYM_ALIGN_UP( buf1 ), sizeof( buf1 ) - SYMCRYPT_ASYM_ALIGN_VALUE, pmMod ); - 
PSYMCRYPT_MODELEMENT peTmp2 = SymCryptModElementCreate( SYMCRYPT_ASYM_ALIGN_UP( buf2 ), sizeof( buf2 ) - SYMCRYPT_ASYM_ALIGN_VALUE, pmMod ); - - (VOID) peTmp1; - (VOID) peTmp2; - - SymCryptFdefModAdd256Asm( pmMod, peSrc1, peSrc2, peTmp1, pbScratch, cbScratch ); - SymCryptFdefModAddGeneric( pmMod, peSrc1, peSrc2, peTmp2, pbScratch, cbScratch ); - - if( memcmp( peTmp1, peTmp2, 64 ) != 0 ) - { - SymCryptFatal( 42 ); - } - - SymCryptFdefModAdd256Asm( pmMod, peSrc1, peSrc2, peDst, pbScratch, cbScratch ); -} - -VOID -SYMCRYPT_CALL -SymCryptFdefModMulMontgomery256Test( - _In_ PCSYMCRYPT_MODULUS pmMod, - _In_ PCSYMCRYPT_MODELEMENT peSrc1, - _In_ PCSYMCRYPT_MODELEMENT peSrc2, - _Out_ PSYMCRYPT_MODELEMENT peDst, - _Out_writes_bytes_( cbScratch ) PBYTE pbScratch, - SIZE_T cbScratch ) -{ - SYMCRYPT_ASYM_ALIGN BYTE buf1[128]; - SYMCRYPT_ASYM_ALIGN BYTE buf2[128]; - PSYMCRYPT_MODELEMENT peTmp1 = SymCryptModElementCreate( SYMCRYPT_ASYM_ALIGN_UP( buf1 ), sizeof( buf1 ) - SYMCRYPT_ASYM_ALIGN_VALUE, pmMod ); - PSYMCRYPT_MODELEMENT peTmp2 = SymCryptModElementCreate( SYMCRYPT_ASYM_ALIGN_UP( buf2 ), sizeof( buf2 ) - SYMCRYPT_ASYM_ALIGN_VALUE, pmMod ); - - (VOID) peTmp1; - (VOID) peTmp2; - - SymCryptFdefModMulMontgomery256Asm( pmMod, peSrc1, peSrc2, peTmp1, pbScratch, cbScratch ); - //SymCryptFdefModMulMontgomery( pmMod, peSrc1, peSrc2, peTmp2, pbScratch, cbScratch ); *** This doesn't produce the same result as it reduces a whole digit, not 256 bits - - if( memcmp( peTmp1, peTmp2, 64 ) != 0 ) - { - // SymCryptFatal( 42 ); - } - - SymCryptFdefModMulMontgomery256Asm( pmMod, peSrc1, peSrc2, peDst, pbScratch, cbScratch ); -} - VOID SYMCRYPT_CALL SymCryptFdefModSquareMontgomery256( diff --git a/lib/i386/fdef_asm.asm b/lib/i386/fdef_asm.cppasm similarity index 94% rename from lib/i386/fdef_asm.asm rename to lib/i386/fdef_asm.cppasm index 1597236..d9663ce 100644 --- a/lib/i386/fdef_asm.asm +++ b/lib/i386/fdef_asm.cppasm @@ -1,5 +1,6 @@ ; -; fdef_asm.asm Assembler code for fast arithmetic +; fdef_asm.cppasm Assembler code for fast arithmetic +; Requires C preprocessor to correctly include C_asm_shared.inc ; ; Copyright (c) Microsoft Corporation. Licensed under the MIT license. ; @@ -11,9 +12,9 @@ ; ; FPO documentation: ; The .FPO provides debugging information. -; This stuff not well documented, +; This stuff not well documented, ; but here is the information I've gathered about the arguments to .FPO -; +; ; In order: ; cdwLocals: Size of local variables, in DWords ; cdwParams: Size of parameters, in DWords. Given that this is all about @@ -23,7 +24,7 @@ ; prolog code with work for better performance. Most uses of ; .FPO seem to set this value to 0 anyway, which is what we ; will do. -; cbRegs : # registers saved in the prolog. +; cbRegs : # registers saved in the prolog. ; fUseBP : 0 if EBP is not used as base pointer, 1 if EBP is used as base pointer ; cbFrame : Type of frame. ; 0 = FPO frame (no frame pointer) @@ -43,7 +44,7 @@ _TEXT SEGMENT PARA PUBLIC USE32 'CODE' include symcrypt_version.inc include symcrypt_magic.inc -include C_asm_shared.inc +#include "C_asm_shared.inc" PUBLIC @SymCryptFdefRawAddAsm@16 PUBLIC @SymCryptFdefRawSubAsm@16 @@ -60,7 +61,7 @@ BEFORE_PROC MACRO ; DB 5 dup (0cch) ENDM - + @@ -86,7 +87,7 @@ pDst dd ? nDigits dd ? SymCryptFdefRawAddAsmFrame ends - + ; ecx = pSrc1 ; edx = pSrc2 @@ -129,7 +130,7 @@ SymCryptFdefRawAddAsmLoop: pop edi pop ebx ret 8 - + @SymCryptFdefRawAddAsm@16 ENDP @@ -154,7 +155,7 @@ pDst dd ? nDigits dd ? 
SymCryptFdefRawSubAsmFrame ends - + ; ecx = pSrc1 ; edx = pSrc2 @@ -197,7 +198,7 @@ SymCryptFdefRawSubAsmLoop: pop edi pop ebx ret 8 - + @SymCryptFdefRawSubAsm@16 ENDP @@ -305,8 +306,8 @@ SymCryptFdefRawMulAsmFrame ends ; for each word in Src1: ; Dst += Src2 * word ; Register assignments - ; - ; eax = tmp/lower half of mult + ; + ; eax = tmp/lower half of mult ; ebx = multiplicant ; ecx = loop counter, initialized to nDigits2 ; edx = upper half of mult @@ -315,7 +316,7 @@ SymCryptFdefRawMulAsmFrame ends ; ebp = carry ; ; esp + pSrc1 running pointer into Src1 - ; esp + + ; esp + mov edi,edi @@ -436,7 +437,7 @@ SymCryptFdefRawMulAsmLoop2: adc edx, 0 mov [edi + 12], eax mov ebp, edx - + add esi, 16 add edi, 16 sub ecx,1 @@ -477,7 +478,7 @@ SymCryptFdefMontgomeryReduceAsmFrame struct 4, NONUNIQUE HighCarry dd ? pSrc dd ? pModValue dd ? - nWords dd ? + nWords dd ? SaveEbp dd ? ; # words still to process in outer loop SaveEsi dd ? SaveEdi dd ? @@ -513,13 +514,13 @@ SymCryptFdefMontgomeryReduceAsmFrame ends SymCryptFdefMontgomeryReduceOuterLoop: ; eax = ; ebx = - ; ecx = + ; ecx = ; edx = ; esi = start of mod value ; edi = pSrc + 4 * loop iteration count ; ebp = - ; compute multiplier for this outer loop iteration. + ; compute multiplier for this outer loop iteration. mov ebx, [esi - SymCryptModulusValueOffsetX86 + SymCryptModulusMontgomeryInv64OffsetX86 ] imul ebx, [edi] ; word we want to zero out, ebx = multiplier for this inner loop @@ -529,7 +530,7 @@ SymCryptFdefMontgomeryReduceOuterLoop: SymCryptFdefMontgomeryReduceInnerLoop: ; eax = mul scratch ; ebx = multiplier - ; ecx = digit counter + ; ecx = digit counter ; edx = mul scratch ; esi = running pointer to mod value ; edi = running pointer to input/scratch @@ -570,7 +571,7 @@ SymCryptFdefMontgomeryReduceInnerLoop: adc edx, 0 mov [edi + 12], eax mov ebp, edx - + add esi, 16 add edi, 16 sub ecx,1 @@ -606,7 +607,7 @@ SymCryptFdefMontgomeryReduceInnerLoop: mov ecx, [esi - SymCryptModulusValueOffsetX86 + SymCryptModulusNdigitsOffsetX86] ; loop counter mov edx, [esp + SymCryptFdefMontgomeryReduceAsmFrame.pDst]; - + ; ecx = nDigits ; Save some values for the copy loop diff --git a/lib/i386/rc4asm.asm b/lib/i386/rc4asm.asm deleted file mode 100644 index de8f6c5..0000000 --- a/lib/i386/rc4asm.asm +++ /dev/null @@ -1,314 +0,0 @@ -; -; rc4asm.asm -; -; Copyright (c) Microsoft Corporation. Licensed under the MIT license. -; -; RC4 implementation in x86 assembler -; This is a new RC4 implementation for SymCrypt. -; It is NOT based on the existing one in RSA32.lib. -; - - - TITLE "RC4" - .586P - -_TEXT SEGMENT PARA PUBLIC USE32 'CODE' - ASSUME CS:_TEXT, DS:FLAT, SS:FLAT - -include symcrypt_version.inc -include symcrypt_magic.inc - -; -; Structure definition that mirrors the SYMCRYPT_RC4_STATE struct -; - -RC4_STATE struct - S db 256 dup (?) - i db ? - j db ? - - SYMCRYPT_MAGIC_FIELD - -RC4_STATE ends - - - PUBLIC @SymCryptRc4InitAsm@12 - PUBLIC @SymCryptRc4CryptAsm@16 - - -BEFORE_PROC MACRO - ; - ; Our current x86 compiler inserts 5 0xcc bytes before every function - ; and starts every function with a 2-byte NOP. - ; This supports hot-patching. - ; - DB 5 dup (0cch) - ENDM - - -; The .FPO provides debugging information. -; This stuff not well documented, -; but here is the information I've gathered about the arguments to .FPO -; -; In order: -; cdwLocals: Size of local variables, in DWords -; cdwParams: Size of parameters, in DWords. Given that this is all about -; stack stuff, I'm assuming this is only about parameters passed -; on the stack. 
-; cbProlog : Number of bytes in the prolog code. We have interleaved the -; prolog code with work for better performance. Most uses of -; .FPO seem to set this value to 0 anyway, which is what we -; will do. -; cbRegs : # registers saved in the prolog. 4 in our case -; fUseBP : 0 if EBP is not used as base pointer, 1 if EBP is used as base pointer -; cbFrame : Type of frame. -; 0 = FPO frame (no frame pointer) -; 1 = Trap frame (result of a CPU trap event) -; 2 = TSS frame -; -; Having looked at various occurrences of .FPO in the Windows code it -; seems to be used fairly sloppy, with lots of arguments left 0 even when -; they probably shouldn't be according to the spec. -; - - - - BEFORE_PROC - -@SymCryptRc4InitAsm@12 PROC -;VOID -;SYMCRYPT_CALL -;SymCryptRc4InitAsm( -; _Out_ PSYMCRYPT_RC4_STATE pState, -; _In_reads_bytes_( cbKey ) PCBYTE pbKey, -; _In_ SIZE_T cbKey ); -; -; NOTE: Unlike the SymCryptRc4Init function -; this function does not check the cbKey validity, and does not return an error code. -; Currently we don't have the error code values symbolically in the asm environment. -; We use an inlined function to generate the errors instead, and call this function -; only when there are no errors. -; - -Rc4InitFrame struct 4, NONUNIQUE - -pbKey dd ? -SaveEdi dd ? -SaveEsi dd ? -SaveEbp dd ? -SaveEbx dd ? -ReturnAddress dd ? -cbKey dd ? - -Rc4InitFrame ends - - .FPO(5,1,0,4,0,0) - - ; ecx = pState - ; edx = pKey - ; [esp + 4] = cbKey - - ; - ; Set up stack frame, and initialize pbKey - ; - mov edi,edi ; 2-byte NOP for hot-patching - - push ebx - push ebp - push esi - push edi - push edx - - ; - ; Initialize S[i] = i - ; - lea esi,[ecx + 100h] - mov edi,ecx - - mov eax,03020100h - mov ebx,04040404h - -@@: - mov [edi],eax - add eax,ebx - mov [edi+4],eax - add eax,ebx - mov [edi+8],eax - add eax,ebx - mov [edi+12],eax - add eax,ebx - add edi,16 - cmp edi,esi - jb @B - - - mov ebp,edx - xor ebx,ebx ; j = 0 - xor esi,esi ; i = 0 - mov edi,[esp + Rc4InitFrame.cbKey] - add edi, edx ; edi = pbKey + cbKey - -SymCryptRc4InitMainLoop: - ; Registers: - ; eax = Tmp1 - ; ebx = j - ; ecx = S - ; edx = Tmp2 - ; esi = i - ; edi = keyLimit ; just beyond the key - ; ebp = pKey ; pointer to current key byte - - movzx edx,byte ptr[ebp] ; get key byte - add ebx,edx ; j += key byte - movzx eax,byte ptr[ecx + esi] ; get S[i] - add ebx,eax ; j += S[i] - - and ebx,0ffh - - movzx edx,byte ptr [ecx + ebx]; get S[j] - mov byte ptr[ecx + ebx], al ; update S[j] - mov byte ptr[ecx + esi], dl ; update S[i] - - add ebp,1 ; increment key pointer modulo key length - cmp ebp,edi - jb @F - mov ebp,[esp + Rc4InitFrame.pbKey] -@@: - - add esi,1 ; increment i - cmp esi,100h - jb SymCryptRc4InitMainLoop - - mov word ptr [ecx + RC4_STATE.i], 1 ; i = 1; j = 0 - - add esp,4 - pop edi - pop esi - pop ebp - pop ebx - ret 4 - - -@SymCryptRc4InitAsm@12 ENDP - - - - - BEFORE_PROC - -@SymCryptRc4CryptAsm@16 PROC -;VOID -;SYMCRYPT_CALL -;SymCryptRc4Crypt( -; _Inout_ PSYMCRYPT_RC4_STATE pState, -; _In_reads_bytes_( cbData ) PCBYTE pbSrc, -; _Out_writes_bytes_( cbData ) PBYTE pbDst, -; _In_ SIZE_T cbData ) - -Rc4CryptFrame struct 4, NONUNIQUE -pbEndDst dd ? -SaveEdi dd ? -SaveEsi dd ? -SaveEbp dd ? -SaveEbx dd ? -ReturnAddress dd ? -pbDst dd ? -cbData dd ? 
- -Rc4CryptFrame ends - - .FPO(5,2,0,4,0,0) - - - mov edi,edi - - push ebx - push ebp - push esi - push edi - sub esp,4 - - SYMCRYPT_CHECK_MAGIC ecx, RC4_STATE - - mov eax,[esp + Rc4CryptFrame.cbData] - test eax,eax - jz Rc4CryptDoNothing - - mov ebp,[esp + Rc4CryptFrame.pbDst] - add eax,ebp - mov [esp + Rc4CryptFrame.pbEndDst], eax - - mov edi, edx - movzx edx,[ecx + RC4_STATE.i] - movzx esi,[ecx + RC4_STATE.j] - - ; - ; Further perf improvements are possible. - ; Instead of encrypting byte-by-byte, we can collect 4 bytes of the key - ; stream in a register, and then encrypt 4 bytes at a time. - ; This reduces the # memory operations we do per byte. - ; Ideally this is done with aligned operations, either - ; aligning to pbSrc, pbDst, or to i (which removes the need to increment i every time). - ; - -@@: - ; eax Ti - ; ebx Tj - ; ecx S - ; edx i - ; esi j - ; edi pSrc - ; ebp pDst - - movzx eax, byte ptr[ecx + edx] ; Ti = S[i] - - ;add esi, eax - ;and esi, 0ffh - lea ebx, [esi + eax] - movzx esi, bl ; j += Ti - - movzx ebx, byte ptr[ecx + esi] ; Tj = S[j] - mov [ecx + edx], bl ; S[i] = Tj - mov [ecx + esi], al ; S[j] = Ti - - ;add eax,ebx - ;and eax,0ffh - lea eax,[eax + ebx] - movzx eax,al ; Ti = Ti + Tj - - mov al,[ecx + eax] ; Til = S[Ti] - - ;add edx, 1 - ;and 0ffh - lea edx,[edx + 1] - movzx edx,dl ; i += 1 - - xor al,[edi] - add edi,1 - mov [ebp],al - add ebp, 1 - - cmp ebp,[esp + Rc4CryptFrame.pbEndDst] - jb @B - - mov eax, esi - mov [ecx + RC4_STATE.i], dl - mov [ecx + RC4_STATE.j], al - -Rc4CryptDoNothing: - - add esp,4 - pop edi - pop esi - pop ebp - pop ebx - ret 8 - - -@SymCryptRc4CryptAsm@16 ENDP - - - -_TEXT ENDS - - END diff --git a/lib/i386/sha1asm.asm b/lib/i386/sha1asm.asm deleted file mode 100644 index 5de211d..0000000 --- a/lib/i386/sha1asm.asm +++ /dev/null @@ -1,383 +0,0 @@ -; -; Sha1Asm.Asm -; -; Copyright (c) Microsoft Corporation. Licensed under the MIT license. -; -; - -; -; This module implements the bulk processing of the FIPS 180-1 SHA message digest algorithm. -; for the x86 processor architecture. -; -; This implementation is derived from an older one by Scott Field and -; Dan Shumow. -; -; This implementation is optimized for Intel Core and contemporary AMD CPUs. -; Optimizations for pre-P3 Intel CPUs has been removed. -; - - - TITLE sha1asm.asm - .486 - -_TEXT SEGMENT PARA PUBLIC USE32 'CODE' - ASSUME CS:_TEXT, DS:FLAT, SS:FLAT - - PUBLIC @SymCryptSha1AppendBlocksAsm@12 - - ; - ; The four round constants used by SHA-1 - ; - -K0_19 EQU 05a827999H -K20_39 EQU 06ed9eba1H -K40_59 EQU 08f1bbcdcH -K60_79 EQU 0ca62c1d6H - - align 16 - -;VOID -;SYMCRYPT_CALL -;SymCryptSha1AppendBlocks( _Inout_updates_( 5 ) PUINT32 H, -; _In_reads_bytes_( cbData ) PCBYTE pbData, -; SIZE_T cbData ) -; -@SymCryptSha1AppendBlocksAsm@12 PROC - -; -; To keep stack manipulatins simple we define a structure and use that for all accesses. -; -SymCryptSha1AppendBlocksFrame struct 4, NONUNIQUE - -Wbuf dd 16 dup (?) -Hptr dd ? -pbData dd ? -BlockCount dd ? -SaveEdi dd ? -SaveEsi dd ? -SaveEbp dd ? -SaveEbx dd ? -ReturnAddress dd ? -CbData dd ? - -SymCryptSha1AppendBlocksFrame ends - - ; - ; We use the W buffer extensively; this is a shorthand for the base address - ; -W equ esp+SymCryptSha1AppendBlocksFrame.Wbuf - - ; - ; The .FPO provides debugging information for stack frames that do not use - ; ebp as a base pointer. 
- ; This stuff not well documented, - ; but here is the information I've gathered about the arguments to .FPO - ; - ; In order: - ; cdwLocals: Size of local variables, in DWords - ; cdwParams: Size of parameters, in DWords. Given that this is all about - ; stack stuff, I'm assuming this is only about parameters passed - ; on the stack. - ; cbProlog : Number of bytes in the prolog code. We sometimes interleaved the - ; prolog code with work for better performance. Most uses of - ; .FPO seem to set this value to 0. - ; The debugger seems to work if the prolog defined by this value - ; contains all the stack adjustments. - ; cbRegs : # registers saved in the prolog. 4 in our case - ; fUseBP : 0 if EBP is not used as base pointer, 1 if EBP is used as base pointer - ; cbFrame : Type of frame. - ; 0 = FPO frame (no frame pointer) - ; 1 = Trap frame (result of a CPU trap event) - ; 2 = TSS frame - ; - ; Having looked at various occurrences of .FPO in the Windows code it - ; seems to be used fairly sloppy, with lots of arguments left 0 even when - ; they probably shouldn't be according to the spec. - ; - .FPO(23,1,3,4,0,0) ; 3 byte prolog (covers esp ajustment only) - - ; At this point: - ; ecx = H - ; edx = pbData - ; [esp+4] = cbData - - ; - ; Set up our stack frame and save non-volatile registers - ; - sub esp,SymCryptSha1AppendBlocksFrame.ReturnAddress - mov [esp+SymCryptSha1AppendBlocksFrame.SaveEbp],ebp - mov [esp+SymCryptSha1AppendBlocksFrame.SaveEdi],edi - mov [esp+SymCryptSha1AppendBlocksFrame.SaveEsi],esi - mov [esp+SymCryptSha1AppendBlocksFrame.SaveEbx],ebx - - mov [esp+SymCryptSha1AppendBlocksFrame.Hptr], ecx - - ; - ; To allow macro re-ordering of our registers we use symbolic names - ; for the registers. - ; r0-r4 are the 5 state registers. x1 and x2 are extra scratch registers. - ; Note: some prolog code puts the right value in the right register and - ; has to be updated if this mapping is changed. - ; -r0 EQU eax -r1 EQU ebx -r2 EQU ecx -r3 EQU edx -r4 EQU esi -x1 EQU ebp -x2 EQU edi - - ; - ; compute how many blocks we will process. - ; This code ensures that we never exceed the data buffer we were given, - ; although we silently round the cbData parameter down to the next - ; multiple of 64. - ; Do nothing if no blocks need to be processed. - ; - mov eax,[esp+SymCryptSha1AppendBlocksFrame.CbData] - shr eax,6 - jz SymCryptSha1AppendBlocksDone - mov [esp+SymCryptSha1AppendBlocksFrame.BlockCount], eax - - ; - ; The data pointer goes into x1 = ebp at the start of our loop - ; - mov ebp,edx - - ; - ; Load the H state from [ecx], making sure we load the r2=ecx register - ; last. - ; - mov r0,[ecx ] - mov r4,[ecx+ 4] - mov r3,[ecx+ 8] - mov r1,[ecx+16] - mov r2,[ecx+12] - - -SymCryptSha1AppendBlocksLoop: - ; - ; This is the main loop. We process 64 bytes in each iteration. - ; invariant: ebp = pbData - ; - - ; - ; Most of the code in the loop is generated through macros using parameters to - ; rename the registers. - ; The macros get the register number passed as parameter. They use - ; "r&" to paste the number and the 'r' together and get the register - ; name we defined above. - ; - -ROUND_CH_0_15 MACRO round,ra,rb,rc,rd,re,x1,x2 - ; - ; Code for round 0-15. - ; This code loads data from the data buffer & BSWAPs the data to get it into the - ; right form. 
- ; - ; Parameters: - ; round round number - ; ra register number that contains the a value - ; rb register number that contains the b value - ; rc register number that contains the c value - ; rd register number that contains the d value - ; re register number that contains the e value - ; x1 pointer to the input data - ; x2 scratch register. - ; - ; We use the formula CH(b,c,d) = ((d ^ c) & b) ^ c which uses only one temp register. - ; We start with the d value as that is the oldest value and available the first - ; - ; See FIPS 180-2 for our symbolic notation. - ; - mov x2,[x1+4*round] ; Fetch word from message - bswap x2 ; x2 = Wt - add r&re,x2 ; re = e + Wt - mov [W + 4*round],x2 ; Store in W buffer for future use - - mov x2,r&ra ; x2 = a - rol x2,5 ; x2 = ROL(a,5) - add r&re,x2 ; re = e + Wt + ROL(a,5) - - mov x2,r&rd ; x2 = d - xor x2,r&rc ; x2 = (d ^ c) - and x2,r&rb ; x2 = ((d ^ c) & b) - ror r&rb,2 ; rb = ROL( b, 30 ) - xor x2,r&rd ; x2 = ((d ^ c) & b) ^ d = CH(b,c,d) - lea r&re,[r&re+x2+K0_19] ; re = e + Wt + ROL(a,5) + Ch(b,c,d) + Kt - - ENDM - -ROUND_CH MACRO round, ra, rb, rc, rd, re, x1, x2 - ; - ; See ROUND_CH_0_15 for most parameters. - ; x1 and x2 are both scratch registers - ; - mov x2,[W+4*((round-16) MOD 16)] ; x2 = W[t-16] - mov x1,r&ra ; x1 = a - rol x1,5 ; x1 = ROL(a,5) - xor x2,[W+4*((round-14) MOD 16)] ; x2 = W[t-16] ^ W[t-14] - add r&re,x1 ; re = e + ROL(a,5) - mov x1,r&rd ; x1 = d - xor x2,[W+4*((round- 8) MOD 16)] ; x2 = W[t-16] ^ W[t-14] ^ W[t-8] - xor x1,r&rc ; x1 = (d ^ c) - and x1,r&rb ; x1 = ((d ^ c) & b) - xor x2,[W+4*((round- 3) MOD 16)] ; x2 = W[t-16] ^ W[t-14] ^ W[t-8] ^ W[t-3] - xor x1,r&rd ; x1 = ((d ^ c) & b) ^ d = CH(b,c,d) - rol x2,1 ; x2 = Wt - mov [W+4*((round-16) MOD 16)],x2 ; - add r&re,x2 ; re = e + ROL(a,5) + Wt - ror r&rb,2 ; rb = ROL( b, 30 ) - lea r&re,[r&re+x1+K0_19] ; re = e + Wt + ROL(a,5) + Ch(b,c,d) + Kt - ENDM - -ROUND_PARITY MACRO round, ra, rb, rc, rd, re, x1, x2, K, store - ; - ; See ROUND_CH for most parameters - ; K is the round constant to use. - ; store is 1 if the Wt value should be stored, 0 otherwise - ; (used to avoid stores in the last few rounds) - ; - ; The order of xorring the registers b, c, and d is driven by the data dependency graph. 
- ; We start with d (the oldest) and then do b to unblock the subsequent rotate - ; - mov x2,[W+4*((round-16) MOD 16)] ; x2 = W[t-16] - mov x1,r&ra ; x1 = a - rol x1,5 ; x1 = ROL(a,5) - xor x2,[W+4*((round-14) MOD 16)] ; x2 = W[t-16] ^ W[t-14] - add r&re,x1 ; re = e + ROL(a,5) - mov x1,r&rd ; x1 = d - xor x2,[W+4*((round- 8) MOD 16)] ; x2 = W[t-16] ^ W[t-14] ^ W[t-8] - xor x1,r&rb ; x1 = (d ^ b) - xor x1,r&rc ; x1 = (d ^ b ^ c) = Parity(b,c,d) - xor x2,[W+4*((round- 3) MOD 16)] ; x2 = W[t-16] ^ W[t-14] ^ W[t-8] ^ W[t-3] - rol x2,1 ; x2 = Wt - add r&re,x1 ; re = e + ROL(a,5) + Parity(b,c,d) - IF store - mov [W+4*((round-16) MOD 16)],x2 ; - ENDIF - ror r&rb,2 ; rb = ROL( b, 30 ) - lea r&re,[r&re+x2+K] ; re = e + ROL(a,5) + Parity(b,c,d) + Wt + Kt - - ENDM - -ROUND_MAJ MACRO round, ra, rb, rc, rd, re, x1, x2 - ; - ; See above for parameter explanation - ; - mov x2,[W+4*((round-16) MOD 16)] ; x2 = W[t-16] - mov x1,r&ra ; x1 = a - rol x1,5 ; x1 = ROL(a,5) - xor x2,[W+4*((round-14) MOD 16)] ; x2 = W[t-16] ^ W[t-14] - add r&re,x1 ; re = e + ROL(a,5) - mov x1,r&rd ; x1 = d - xor x2,[W+4*((round- 8) MOD 16)] ; x2 = W[t-16] ^ W[t-14] ^ W[t-8] - or x1,r&rc ; x1 = (d | c) - and x1,r&rb ; x1 = ((d | c) & b) - xor x2,[W+4*((round- 3) MOD 16)] ; x2 = W[t-16] ^ W[t-14] ^ W[t-8] ^ W[t-3] = Wt - rol x2,1 ; x2 = Wt - add r&re,x2 ; re = e + ROL(a,5) + Wt - mov [W+4*((round-16) MOD 16)],x2 ; - - mov x2,r&rc ; x2 = c - and x2,r&rd ; x2 = (c & d) - or x1,x2 ; x1 = ((d | c) & b) | (d & c) = MAJ(b,c,d) - - ror r&rb,2 ; rb = ROL( b, 30 ) - - lea r&re,[r&re+x1+K40_59] ; re = e + ROL(a,5) + Wt + Maj(b,c,d) + Kt - ENDM - - ; - ; With these macros we can now produce the actual code. - ; Note the use of the % operator which evaluates the expression and yields the result as text. - ; Together with the macros and the r EQUs this provides us with automatic register renaming - ; for each round. - ; - FOR t, <0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15> - ROUND_CH_0_15 t, %(t MOD 5), %((t + 4) MOD 5), %((t + 3) MOD 5), %((t + 2) MOD 5), %((t + 1) MOD 5), x1, x2 - ENDM - - ; - ; For the rest of the computation we need the extra register, so we update the data pointer and store it. - ; - add ebp,64 - mov [esp+SymCryptSha1AppendBlocksFrame.pbData], ebp - - FOR t, <16, 17, 18, 19> - ROUND_CH t, %(t MOD 5), %((t + 4) MOD 5), %((t + 3) MOD 5), %((t + 2) MOD 5), %((t + 1) MOD 5), x1, x2 - ENDM - - FOR t, <20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39> - ROUND_PARITY t, %(t MOD 5), %((t + 4) MOD 5), %((t + 3) MOD 5), %((t + 2) MOD 5), %((t + 1) MOD 5), x1, x2, K20_39, 1 - ENDM - - FOR t, <40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59> - ROUND_MAJ t, %(t MOD 5), %((t + 4) MOD 5), %((t + 3) MOD 5), %((t + 2) MOD 5), %((t + 1) MOD 5), x1, x2 - ENDM - - FOR t, <60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76> - ROUND_PARITY t, %(t MOD 5), %((t + 4) MOD 5), %((t + 3) MOD 5), %((t + 2) MOD 5), %((t + 1) MOD 5), x1, x2, K60_79, 1 - ENDM - - ; - ; The last three rounds do not need to store their Wt in the W buffer as that value will never get used. 
- ; - FOR t, <77, 78, 79> - ROUND_PARITY t, %(t MOD 5), %((t + 4) MOD 5), %((t + 3) MOD 5), %((t + 2) MOD 5), %((t + 1) MOD 5), x1, x2, K60_79, 0 - ENDM - - ; - ; Now we update the state - ; - mov x2,[esp+SymCryptSha1AppendBlocksFrame.Hptr] - add r0,[x2 ] - add r4,[x2+ 4] - add r3,[x2+ 8] - add r2,[x2+12] - add r1,[x2+16] - - mov [x2 ], r0 - mov [x2+ 4], r4 - mov [x2+ 8], r3 - mov [x2+12], r2 - mov [x2+16], r1 - - ; - ; See if we have more data to process, and load the data pointer register again - ; - dec [esp+SymCryptSha1AppendBlocksFrame.BlockCount] - mov ebp, [esp+SymCryptSha1AppendBlocksFrame.pbData] - jnz SymCryptSha1AppendBlocksLoop - - ; - ; We're done processing the blocks. The result is already in the state, so all we have to do - ; is clean up. - ; - ; Wipe the W buffer - ; The @@: label is an anonymous label. You can refer to the previous one using @B, which is easy to read. - ; - mov ecx,8 - xor eax,eax -@@: dec ecx - mov [esp+8*ecx],eax - mov [esp+8*ecx+4],eax - jnz @B - -SymCryptSha1AppendBlocksDone: - ; - ; Restore non-volatile regisers & stackpointer - ; - mov ebp,[esp+SymCryptSha1AppendBlocksFrame.SaveEbp] - mov edi,[esp+SymCryptSha1AppendBlocksFrame.SaveEdi] - mov esi,[esp+SymCryptSha1AppendBlocksFrame.SaveEsi] - mov ebx,[esp+SymCryptSha1AppendBlocksFrame.SaveEbx] - add esp,SymCryptSha1AppendBlocksFrame.ReturnAddress - - ret 4 - -@SymCryptSha1AppendBlocksAsm@12 ENDP -_TEXT ENDS - -END - diff --git a/lib/libmain.c b/lib/libmain.c index c8e050d..809b664 100644 --- a/lib/libmain.c +++ b/lib/libmain.c @@ -7,9 +7,7 @@ #include "precomp.h" -#define EQU = #include "C_asm_shared.inc" -#undef EQU #include "buildInfo.h" @@ -34,16 +32,16 @@ SymCryptLibraryWasNotInitialized() #endif -const CHAR * SymCryptBuildString = - "v" SYMCRYPT_BUILD_INFO_VERSION - "_" SYMCRYPT_BUILD_INFO_BRANCH +const CHAR * SymCryptBuildString = + "v" SYMCRYPT_BUILD_INFO_VERSION + "_" SYMCRYPT_BUILD_INFO_BRANCH "_" SYMCRYPT_BUILD_INFO_COMMIT "_" SYMCRYPT_BUILD_INFO_TIMESTAMP; VOID SYMCRYPT_CALL SymCryptInitEnvCommon( UINT32 version ) -// Returns TRUE if the initializatoin steps have to be performed. +// Returns TRUE if the initialization steps have to be performed. { UINT32 tmp; diff --git a/lib/linux/asmstubs.c b/lib/linux/asmstubs.c deleted file mode 100644 index d1fdbc3..0000000 --- a/lib/linux/asmstubs.c +++ /dev/null @@ -1,223 +0,0 @@ -// -// asmstubs.c -// Temporary forwarders for ASM implementations which we don't yet support with GCC/LLVM -// -// Copyright (c) Microsoft Corporation. Licensed under the MIT license. 
-//
-
-#include "../precomp.h"
-
-extern const SYMCRYPT_BLOCKCIPHER SymCryptAesBlockCipherNoOpt;
-
-VOID
-SYMCRYPT_CALL
-SymCryptAesEncryptAsm(
-    _In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
-    _In_reads_(SYMCRYPT_AES_BLOCK_SIZE) PCBYTE pbSrc,
-    _Out_writes_(SYMCRYPT_AES_BLOCK_SIZE) PBYTE pbDst )
-{
-    SymCryptAesEncryptC( pExpandedKey, pbSrc, pbDst );
-}
-
-VOID
-SYMCRYPT_CALL
-SymCryptAesDecryptAsm(
-    _In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
-    _In_reads_(SYMCRYPT_AES_BLOCK_SIZE) PCBYTE pbSrc,
-    _Out_writes_(SYMCRYPT_AES_BLOCK_SIZE) PBYTE pbDst )
-{
-    SymCryptAesDecryptC( pExpandedKey, pbSrc, pbDst );
-}
-
-VOID
-SYMCRYPT_CALL
-SymCryptAesCbcEncryptAsm(
-    _In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
-    _Inout_updates_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbChainingValue,
-    _In_reads_( cbData ) PCBYTE pbSrc,
-    _Out_writes_( cbData ) PBYTE pbDst,
-    SIZE_T cbData )
-{
-    SymCryptCbcEncrypt( &SymCryptAesBlockCipherNoOpt, pExpandedKey, pbChainingValue, pbSrc, pbDst, cbData );
-}
-
-VOID
-SYMCRYPT_CALL
-SymCryptAesCbcDecryptAsm(
-    _In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
-    _Inout_updates_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbChainingValue,
-    _In_reads_( cbData ) PCBYTE pbSrc,
-    _Out_writes_( cbData ) PBYTE pbDst,
-    SIZE_T cbData )
-{
-    SymCryptCbcDecrypt( &SymCryptAesBlockCipherNoOpt, pExpandedKey, pbChainingValue, pbSrc, pbDst, cbData );
-}
-
-VOID
-SYMCRYPT_CALL
-SymCryptAesCtrMsb64Asm(
-    _In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
-    _Inout_updates_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbChainingValue,
-    _In_reads_( cbData ) PCBYTE pbSrc,
-    _Out_writes_( cbData ) PBYTE pbDst,
-    SIZE_T cbData )
-{
-    SYMCRYPT_ASSERT( SymCryptAesBlockCipherNoOpt.blockSize == SYMCRYPT_AES_BLOCK_SIZE ); // keep Prefast happy
-    SymCryptCtrMsb64( &SymCryptAesBlockCipherNoOpt, pExpandedKey, pbChainingValue, pbSrc, pbDst, cbData );
-}
-
-VOID
-SYMCRYPT_CALL
-SymCryptWipeAsm( _Out_writes_bytes_( cbData ) PVOID pbData, SIZE_T cbData )
-{
-    volatile BYTE * p = (volatile BYTE *) pbData;
-    SIZE_T i;
-
-    for( i=0; i<cbData; i++ )
-    {
-        p[i] = 0;
-    }
-}
diff --git a/scripts/symcryptasm_processor.py b/scripts/symcryptasm_processor.py
new file mode 100644
--- /dev/null
+++ b/scripts/symcryptasm_processor.py
+"""
+This script processes symcryptasm files into assembly for a target assembler (currently MASM or
+GAS); we use names of the form R<n> to denote registers in the symcryptasm register naming scheme.
+
+
+A leaf function (a function which does not call another function) begins with an invocation of the
+FUNCTION_START macro which currently takes 3 arguments:
+1) The function name
+    This must be the name that matches the corresponding declaration of the function
+2) The number of arguments (arg_count) that the function takes
+    These arguments will be accessible in some contiguous region of the symcrypt registers at the
+    start of the function
+    On amd64 this contiguous region is R1..R<arg_count>
+    Note: arg_count need not correspond to the exact number of arguments in the function declaration
+    if the assembly does not use some tail of the arguments
+3) The number of registers (reg_count) that the function uses
+    These registers will be accessible as R0..R<reg_count-1>
+
+A leaf function ends with the FUNCTION_END macro, which also takes the function name
+    (a FUNCTION_END macro's function name must match the preceding FUNCTION_START's name)
+
+At the function start a prologue is generated which arranges the arguments appropriately in
+registers, and saves non-volatile registers that have been requested to be used.
+At the function end an epilogue is generated which restores the non-volatile registers and returns.
+
+
+A nested function (a function which does call another function) is specified similarly, only using
+NESTED_FUNCTION_START and NESTED_FUNCTION_END macros.
+A nested function currently updates and aligns the stack pointer in the function prologue, and
+avoids use of the redzone in the SystemV ABI.
+
+
+A macro begins with an invocation of the MACRO_START macro which takes the macro name and a
+variable number of macro argument names. It ends with MACRO_END.
+
+### Architecture specifics ###
+
+### amd64 ###
+We allow up to 15 registers to be addressed, with the names:
+Q0-Q14 (64-bit registers), D0-D14 (32-bit registers), W0-W14 (16-bit registers), and B0-B14 (8-bit
+registers)
+Xmm0-Xmm5 registers may be used directly in assembly too, as in both amd64 calling conventions we
+currently support these registers are volatile, so they do not need any special handling
+
+On function entry we insert a prologue which ensures:
+Q0 is the result register (the return value of the function, and the low half of a multiplication)
+Q1-Q6 are the first 6 arguments passed to the function
+
+Additionally, there is a special case for functions using mul or mulx instructions, as these
+instructions make rdx a special register. Functions using these instructions may address Q0-Q14,
+and QH. As rdx is used to pass arguments, its value is moved to another register in the function
+prologue. The MUL_FUNCTION_START and MUL_FUNCTION_END macros are used in this case.
+    We currently do not support nested mul functions, as we have none of them.
+
+"""
+
+import re
+import types
+import logging
+
+class Register:
+    """A class to represent registers"""
+
+    def __init__(self, name64, name32, name16, name8):
+        self.name64 = name64
+        self.name32 = name32
+        self.name16 = name16
+        self.name8 = name8
+
+# amd64 registers
+REG_RAX = Register("rax", "eax", "ax", "al")
+REG_RBX = Register("rbx", "ebx", "bx", "bl")
+REG_RCX = Register("rcx", "ecx", "cx", "cl")
+REG_RDX = Register("rdx", "edx", "dx", "dl")
+REG_RSI = Register("rsi", "esi", "si", "sil")
+REG_RDI = Register("rdi", "edi", "di", "dil")
+REG_RSP = Register("rsp", "esp", "sp", "spl")
+REG_RBP = Register("rbp", "ebp", "bp", "bpl")
+REG_R8 = Register( "r8", "r8d", "r8w", "r8b")
+REG_R9 = Register( "r9", "r9d", "r9w", "r9b")
+REG_R10 = Register("r10", "r10d", "r10w", "r10b")
+REG_R11 = Register("r11", "r11d", "r11w", "r11b")
+REG_R12 = Register("r12", "r12d", "r12w", "r12b")
+REG_R13 = Register("r13", "r13d", "r13w", "r13b")
+REG_R14 = Register("r14", "r14d", "r14w", "r14b")
+REG_R15 = Register("r15", "r15d", "r15w", "r15b")
+
+class CallingConvention:
+    """A class to represent calling conventions"""
+
+    def __init__(self, name, architecture, mapping, argument_registers, volatile_registers, gen_prologue_fn, gen_epilogue_fn, gen_get_memslot_offset_fn):
+        self.name = name
+        self.architecture = architecture
+        self.mapping = mapping
+        self.argument_registers = argument_registers
+        self.volatile_registers = volatile_registers
+        self.gen_prologue_fn = types.MethodType(gen_prologue_fn, self)
+        self.gen_epilogue_fn = types.MethodType(gen_epilogue_fn, self)
+        self.gen_get_memslot_offset_fn = types.MethodType(gen_get_memslot_offset_fn, self)
+
+
+def get_mul_mapping_from_normal_mapping(mapping, argument_registers):
+    """Gets the register mapping used in functions requiring special rdx handling.
+
+    In amd64, when using mul and mulx, rdx is a special register.
+    rdx is also used for passing arguments in both Msft and System V calling conventions.
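+    For instance, in the Msft x64 mapping defined below rdx holds argument 2, while in the
+    System V mapping it holds argument 3.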
+ In asm functions that use mul or mulx, we will explicitly move the argument passed in + rdx to a different volatile register in the function prologue, and in the function body + we refer to rdx using (Q|D|W|B)H. + """ + rdx_index = None + return_mapping = { 'H': REG_RDX } + for (index, register) in mapping.items(): + if register == REG_RDX: + rdx_index = index + break + for (index, register) in mapping.items(): + # preserve argument registers + if (index <= argument_registers) and (index != rdx_index): + return_mapping[index] = register + # replace rdx with the first non-argument register + if index == argument_registers+1: + return_mapping[rdx_index] = register + # shuffle all later registers down to fill the gap + if index > argument_registers+1: + return_mapping[index-1] = register + return return_mapping + +# Calling convention constants + +MAX_FUNCTION_ARGUMENT_COUNT = 6 # restrict to 6 arguments for now +MAX_FUNCTION_REGISTER_COUNT = 15 + +# Microsoft x64 calling convention +MAPPING_AMD64_MSFT = { + 0: REG_RAX, # Result register + 1: REG_RCX, # Argument 1 / volatile + 2: REG_RDX, # Argument 2 / volatile + 3: REG_R8, # Argument 3 / volatile + 4: REG_R9, # Argument 4 / volatile + 5: REG_R10, # volatile + 6: REG_R11, # volatile + 7: REG_RSI, # All registers from rsi are non-volatile and need to be saved/restored in epi/prologue + 8: REG_RDI, + 9: REG_RBP, + 10:REG_RBX, + 11:REG_R12, + 12:REG_R13, + 13:REG_R14, + 14:REG_R15, + # currently not mapping rsp +} + +def calc_amd64_shadow_space_allocation_size(self, reg_count): + # If we are a nested function, we must allocate 32B of shadow space on the stack, and ensure the + # stack pointer is aligned to 16B + # Before the prologue we have rsp % 16 == 8 - as the call pushed an 8B return address on an + # aligned stack + alignment = 8 + # We then pushed some number of additional 8B registers onto the stack + if reg_count > self.volatile_registers: + alignment = (alignment + (8 * (self.volatile_registers - reg_count))) % 16 + shadow_space_allocation_size = 32 + if alignment == 8: + # possibly allocate 8 more bytes to align the stack to 16B + shadow_space_allocation_size += 8 + return shadow_space_allocation_size + +def gen_prologue_amd64_msft(self, arg_count, reg_count, mul_fixup="", nested=False): + prologue = "\n" + if reg_count > self.volatile_registers: + prologue += "rex_push_reg Q%s\n" % self.volatile_registers + for i in range(self.volatile_registers+1, reg_count): + prologue += "push_reg Q%s\n" % i + prologue += "\nEND_PROLOGUE\n\n" + + shadow_space_allocation_size = 0 + + if nested: + shadow_space_allocation_size = calc_amd64_shadow_space_allocation_size(self, reg_count) + prologue += "sub rsp, %d // allocate shadow space and align stack\n\n" % shadow_space_allocation_size + + prologue += mul_fixup + + # put additional arguments into Q5-Q6 (we do not support more than 6 (MAX_FUNCTION_ARGUMENT_COUNT) arguments for now) + # stack_offset to get the 5th argument is: + # 32B of shadow space + 8B for return address + (8*#pushed registers in prologue) + shadow_space_allocation_size + stack_offset = 32 + 8 + (8*(reg_count-self.volatile_registers)) + shadow_space_allocation_size + for i in range(self.argument_registers+1, min(arg_count+1, MAX_FUNCTION_ARGUMENT_COUNT+1)): + prologue += "mov Q%s, [rsp + %d]\n" % (i, stack_offset) + stack_offset += 8 + return prologue + +def gen_prologue_amd64_msft_mul(self, arg_count, reg_count): + return gen_prologue_amd64_msft(self, arg_count, reg_count, "mov Q2, QH\n") + +def 
gen_prologue_amd64_msft_nested(self, arg_count, reg_count): + return gen_prologue_amd64_msft(self, arg_count, reg_count, "", nested=True) + +def gen_epilogue_amd64_msft(self, arg_count, reg_count, nested=False): + epilogue = "" + + if nested: + shadow_space_allocation_size = calc_amd64_shadow_space_allocation_size(self, reg_count) + epilogue += "add rsp, %d // deallocate shadow space and align stack\n\n" % shadow_space_allocation_size + + if reg_count > self.volatile_registers: + epilogue += "BEGIN_EPILOGUE\n" + for i in reversed(range(self.volatile_registers, reg_count)): + epilogue += "pop Q%s\n" % i + epilogue += "ret\n" + return epilogue + +def gen_epilogue_amd64_msft_nested(self, arg_count, reg_count): + return gen_epilogue_amd64_msft(self, arg_count, reg_count, nested=True) + +def gen_get_memslot_offset_amd64_msft(self, slot, arg_count, reg_count, nested=False): + # only support 4 memory slots for now (in shadow space) + if(slot >= 4): + logging.error("Symcryptasm currently only support 4 memory slots! (requested slot%d)" % slot) + exit(1) + # 8B for return address + (8*#pushed registers in prologue) + stack_offset = 8 + (8*(reg_count-self.volatile_registers)) + if nested: + stack_offset += calc_amd64_shadow_space_allocation_size(self, reg_count) + return "%d /*MEMSLOT%d*/" % (stack_offset+(8*slot), slot) + +def gen_get_memslot_offset_amd64_msft_nested(self, slot, arg_count, reg_count): + return gen_get_memslot_offset_amd64_msft(self, slot, arg_count, reg_count, nested=True) + +CALLING_CONVENTION_AMD64_MSFT = CallingConvention( + "msft_x64", "amd64", MAPPING_AMD64_MSFT, 4, 7, + gen_prologue_amd64_msft, gen_epilogue_amd64_msft, gen_get_memslot_offset_amd64_msft) +CALLING_CONVENTION_AMD64_MSFT_MUL = CallingConvention( + "msft_x64", "amd64", get_mul_mapping_from_normal_mapping(MAPPING_AMD64_MSFT, 4), 4, 6, + gen_prologue_amd64_msft_mul, gen_epilogue_amd64_msft, gen_get_memslot_offset_amd64_msft) +CALLING_CONVENTION_AMD64_MSFT_NESTED = CallingConvention( + "msft_x64", "amd64", MAPPING_AMD64_MSFT, 4, 7, + gen_prologue_amd64_msft_nested, gen_epilogue_amd64_msft_nested, gen_get_memslot_offset_amd64_msft_nested) + +# AMD64 System V calling convention +MAPPING_AMD64_SYSTEMV = { + 0: REG_RAX, # Result register + 1: REG_RDI, # Argument 1 / volatile + 2: REG_RSI, # Argument 2 / volatile + 3: REG_RDX, # Argument 3 / volatile + 4: REG_RCX, # Argument 4 / volatile + 5: REG_R8, # Argument 5 / volatile + 6: REG_R9, # Argument 6 / volatile + 7: REG_R10, # volatile + 8: REG_R11, # volatile + 9: REG_RBX, # All registers from rbx are non-volatile and need to be saved/restored in epi/prologue + 10:REG_RBP, + 11:REG_R12, + 12:REG_R13, + 13:REG_R14, + 14:REG_R15 + # currently not mapping rsp +} + +def gen_prologue_amd64_systemv(self, arg_count, reg_count, mul_fixup="", nested=False): + # push volatile registers onto the stack + prologue = "\n" + if reg_count > self.volatile_registers: + for i in range(self.volatile_registers, reg_count): + prologue += "push Q%s\n" % i + + # If we are a nested function, we need to align the stack to 16B, and allocate space for up to 4 + # memory slots not in the redzone. 
We can use the same logic as on the MSFT x64 side to allocate + # our own space for 32B of local variables (whereas on the MSFT side, we use this for allocating + # space for a function we are about to call) + if nested: + allocation_size = calc_amd64_shadow_space_allocation_size(self, reg_count) + prologue += "sub rsp, %d // allocate memslot space and align stack\n\n" % allocation_size + + prologue += mul_fixup + + # do not support more than 6 (MAX_FUNCTION_ARGUMENT_COUNT) arguments for now + # # put additional arguments into Q7-Qn + # # stack_offset to get the 7th argument is: + # # 8B for return address + # stack_offset = 8 + # for i in range(self.argument_registers+1, arg_count+1): + # prologue += "mov Q%s, [rsp + %d]\n" % (i, stack_offset) + # stack_offset += 8 + + return prologue + +def gen_prologue_amd64_systemv_mul(self, arg_count, reg_count): + return gen_prologue_amd64_systemv(self, arg_count, reg_count, "mov Q3, QH\n") + +def gen_prologue_amd64_systemv_nested(self, arg_count, reg_count): + return gen_prologue_amd64_systemv(self, arg_count, reg_count, "", nested=True) + +def gen_epilogue_amd64_systemv(self, arg_count, reg_count, nested=False): + epilogue = "" + + if nested: + allocation_size = calc_amd64_shadow_space_allocation_size(self, reg_count) + epilogue += "add rsp, %d // deallocate memslot space and align stack\n\n" % allocation_size + + if reg_count > self.volatile_registers: + for i in reversed(range(self.volatile_registers, reg_count)): + epilogue += "pop Q%s\n" % i + epilogue += "ret\n" + return epilogue + +def gen_epilogue_amd64_systemv_nested(self, arg_count, reg_count): + return gen_epilogue_amd64_systemv(self, arg_count, reg_count, nested=True) + +def gen_get_memslot_offset_amd64_systemv(self, slot, arg_count, reg_count, nested=False): + # only support 4 memory slots for now + if(slot >= 4): + logging.error("Symcryptasm currently only support 4 memory slots! 
(requested slot%d)" % slot) + exit(1) + # For leaf functions, use the top of the redzone below the stack pointer + offset = -8 * (slot+1) + if nested: + # For nested functions, use the 32B of memslot space above the stack pointer created in the prologue + offset = 8*slot + return "%d /*MEMSLOT%d*/" % (offset, slot) + +def gen_get_memslot_offset_amd64_systemv_nested(self, slot, arg_count, reg_count): + return gen_get_memslot_offset_amd64_systemv(self, slot, arg_count, reg_count, nested=True) + +CALLING_CONVENTION_AMD64_SYSTEMV = CallingConvention( + "amd64_systemv", "amd64", MAPPING_AMD64_SYSTEMV, 6, 9, + gen_prologue_amd64_systemv, gen_epilogue_amd64_systemv, gen_get_memslot_offset_amd64_systemv) +CALLING_CONVENTION_AMD64_SYSTEMV_MUL = CallingConvention( + "amd64_systemv", "amd64", get_mul_mapping_from_normal_mapping(MAPPING_AMD64_SYSTEMV, 6), 6, 8, + gen_prologue_amd64_systemv_mul, gen_epilogue_amd64_systemv, gen_get_memslot_offset_amd64_systemv) +CALLING_CONVENTION_AMD64_SYSTEMV_NESTED = CallingConvention( + "amd64_systemv", "amd64", MAPPING_AMD64_SYSTEMV, 6, 9, + gen_prologue_amd64_systemv_nested, gen_epilogue_amd64_systemv_nested, gen_get_memslot_offset_amd64_systemv_nested) + + +def gen_function_start_defines(mapping, arg_count, reg_count): + defines = "" + for (index, reg) in mapping.items(): + if (index != 'H') and (index >= max(arg_count+1, reg_count)): + continue + defines += "#define Q%s %s\n" % (index, reg.name64) + defines += "#define D%s %s\n" % (index, reg.name32) + defines += "#define W%s %s\n" % (index, reg.name16) + defines += "#define B%s %s\n" % (index, reg.name8) + return defines + +def gen_function_end_defines(mapping, arg_count, reg_count): + undefs = "" + for (index, _) in mapping.items(): + if (index != 'H') and (index >= max(arg_count+1, reg_count)): + continue + undefs += "#undef Q%s\n" % (index) + undefs += "#undef D%s\n" % (index) + undefs += "#undef W%s\n" % (index) + undefs += "#undef B%s\n" % (index) + return undefs + +MASM_FRAMELESS_FUNCTION_ENTRY = "LEAF_ENTRY %s, _TEXT\n" +MASM_FRAMELESS_FUNCTION_END = "LEAF_END %s, _TEXT\n" +MASM_FRAME_FUNCTION_ENTRY = "NESTED_ENTRY %s, _TEXT\n" +MASM_FRAME_FUNCTION_END = "NESTED_END %s, _TEXT\n" + +GAS_FUNCTION_ENTRY = "%s: .global %s\n" +GAS_FUNCTION_END = "" + +def generate_prologue(assembler, calling_convention, function_name, arg_count, reg_count, nested): + function_entry = None + if assembler == "masm": + # need to identify and mark up frame functions in masm + if nested or (reg_count > calling_convention.volatile_registers): + function_entry = MASM_FRAME_FUNCTION_ENTRY % (function_name) + else: + function_entry = MASM_FRAMELESS_FUNCTION_ENTRY % (function_name) + elif assembler == "gas": + function_entry = GAS_FUNCTION_ENTRY % (function_name, function_name) + + prologue = gen_function_start_defines(calling_convention.mapping, arg_count, reg_count) + prologue += "%s" % (function_entry) + prologue += calling_convention.gen_prologue_fn(arg_count, reg_count) + + return prologue + +def generate_epilogue(assembler, calling_convention, function_name, arg_count, reg_count, nested): + function_end = None + if assembler == "masm": + # need to identify and mark up frame functions in masm + if nested or (reg_count > calling_convention.volatile_registers): + function_end = MASM_FRAME_FUNCTION_END % (function_name) + else: + function_end = MASM_FRAMELESS_FUNCTION_END % (function_name) + elif assembler == "gas": + function_end = GAS_FUNCTION_END + + epilogue = calling_convention.gen_epilogue_fn(arg_count, reg_count) + epilogue += 
"%s" % (function_end) + epilogue += gen_function_end_defines(calling_convention.mapping, arg_count, reg_count) + + return epilogue + +MASM_MACRO_START = "%s MACRO %s\n" +MASM_MACRO_END = "ENDM\n" +GAS_MACRO_START = ".macro %s %s\n" +GAS_MACRO_END = ".endm\n" +MASM_ALTERNATE_ENTRY= "ALTERNATE_ENTRY %s\n" +GAS_ALTERNATE_ENTRY = "%s: .global %s\n" + + +FUNCTION_START_PATTERN = re.compile("\s*(NESTED_)?(MUL_)?FUNCTION_START\s*\(\s*([a-zA-Z0-9]+)\s*,\s*([0-9]+)\s*,\s*([0-9]+)\s*\)") +FUNCTION_END_PATTERN = re.compile("\s*(NESTED_)?(MUL_)?FUNCTION_END\s*\(\s*([a-zA-Z0-9]+)\s*\)") +GET_MEMSLOT_PATTERN = re.compile("GET_MEMSLOT_OFFSET\s*\(\s*slot([0-9]+)\s*\)") +ALTERNATE_ENTRY_PATTERN = re.compile("\s*ALTERNATE_ENTRY\s*\(\s*([a-zA-Z0-9]+)\s*\)") +MACRO_START_PATTERN = re.compile("\s*MACRO_START\s*\(\s*([A-Z_0-9]+)\s*,([^\)]+)\)") +MACRO_END_PATTERN = re.compile("\s*MACRO_END\s*\(\s*\)") + +class ProcessingStateMachine: + """A class to hold the state when processing a file and handle files line by line""" + + def __init__(self, assembler, normal_calling_convention, mul_calling_convention, nested_calling_convention): + self.assembler = assembler + self.normal_calling_convention = normal_calling_convention + self.mul_calling_convention = mul_calling_convention + self.nested_calling_convention = nested_calling_convention + + self.function_start_match = None + self.function_start_line = 0 + self.is_nested_function = None + self.is_mul_function = None + self.calling_convention = None + self.function_name = None + self.arg_count = None + self.reg_count = None + + self.macro_start_match = None + self.macro_name = None + self.macro_args = None + + def process_line(self, line, line_num): + if self.function_start_match == None and self.macro_start_match == None: + return self.process_normal_line(line, line_num) + elif self.function_start_match != None: + return self.process_function_line(line, line_num) + elif self.macro_start_match != None: + return self.process_macro_line(line, line_num) + else: + logging.error("Whoops, something is broken with the state machine (failed at line %d)" % line_num) + exit(1) + + def process_normal_line(self, line, line_num): + # Not currently in a function or macro + match = FUNCTION_START_PATTERN.match(line) + if (match): + return self.process_start_function(match, line, line_num) + + match = MACRO_START_PATTERN.match(line) + if (match): + return self.process_start_macro(match, line, line_num) + + # Not starting a function or a macro + return line + + def process_start_function(self, match, line, line_num): + # Entering a new function + self.function_start_match = match + self.function_start_line = line_num + self.is_nested_function = (match.group(1) == "NESTED_") + self.is_mul_function = (match.group(2) == "MUL_") + self.function_name = match.groups()[-3] + self.arg_count = int(match.groups()[-2]) + self.reg_count = int(match.groups()[-1]) + + if self.is_nested_function and self.is_mul_function: + logging.error( + "Too many prefixes for symcryptasm function - currently only 1 of prefix, MUL_ or NESTED_, is supported!\n\t" + "%s (line %d)" + % (line, line_num)) + exit(1) + if self.arg_count > MAX_FUNCTION_ARGUMENT_COUNT: + logging.error( + "Too many (%d) arguments for symcryptasm function - currently only %d arguments are supported!\n\t" + "%s (line %d)" + % (self.arg_count, MAX_FUNCTION_ARGUMENT_COUNT, match.group(0), line_num)) + exit(1) + if self.reg_count > MAX_FUNCTION_REGISTER_COUNT: + logging.error( + "Too many (%d) registers required for symcryptasm function - only 
+
+    def process_start_function(self, match, line, line_num):
+        # Entering a new function
+        self.function_start_match = match
+        self.function_start_line = line_num
+        self.is_nested_function = (match.group(1) == "NESTED_")
+        self.is_mul_function = (match.group(2) == "MUL_")
+        self.function_name = match.groups()[-3]
+        self.arg_count = int(match.groups()[-2])
+        self.reg_count = int(match.groups()[-1])
+
+        if self.is_nested_function and self.is_mul_function:
+            logging.error(
+                "Too many prefixes for symcryptasm function - currently at most one prefix (MUL_ or NESTED_) is supported!\n\t"
+                "%s (line %d)"
+                % (line, line_num))
+            exit(1)
+        if self.arg_count > MAX_FUNCTION_ARGUMENT_COUNT:
+            logging.error(
+                "Too many (%d) arguments for symcryptasm function - currently only %d arguments are supported!\n\t"
+                "%s (line %d)"
+                % (self.arg_count, MAX_FUNCTION_ARGUMENT_COUNT, match.group(0), line_num))
+            exit(1)
+        if self.reg_count > MAX_FUNCTION_REGISTER_COUNT:
+            logging.error(
+                "Too many (%d) registers required for symcryptasm function - only %d registers are supported!\n\t"
+                "%s (line %d)"
+                % (self.reg_count, MAX_FUNCTION_REGISTER_COUNT, match.group(0), line_num))
+            exit(1)
+        if self.is_mul_function and self.reg_count > MAX_FUNCTION_REGISTER_COUNT-1:
+            logging.error(
+                "Too many (%d) registers required for symcryptasm mul function - only %d registers are supported!\n\t"
+                "%s (line %d)"
+                % (self.reg_count, MAX_FUNCTION_REGISTER_COUNT-1, match.group(0), line_num))
+            exit(1)
+
+        logging.info("%d: function start %s, %d, %d" % (line_num, self.function_name, self.arg_count, self.reg_count))
+
+        if self.is_nested_function:
+            self.calling_convention = self.nested_calling_convention
+        elif self.is_mul_function:
+            self.calling_convention = self.mul_calling_convention
+        else:
+            self.calling_convention = self.normal_calling_convention
+
+        return generate_prologue(self.assembler, self.calling_convention, self.function_name, self.arg_count, self.reg_count, self.is_nested_function)
+
+    def process_start_macro(self, match, line, line_num):
+        self.macro_start_match = match
+        self.macro_name = match.group(1)
+        self.macro_args = [ x.strip() for x in match.group(2).split(",") ]
+
+        logging.info("%d: macro start %s, %s" % (line_num, self.macro_name, self.macro_args))
+
+        if self.assembler == "masm":
+            return MASM_MACRO_START % (self.macro_name, match.group(2))
+        elif self.assembler == "gas":
+            return GAS_MACRO_START % (self.macro_name, match.group(2))
+
+    def process_function_line(self, line, line_num):
+        # Currently in a function
+
+        match = ALTERNATE_ENTRY_PATTERN.match(line)
+        if (match):
+            if self.assembler == "masm":
+                return MASM_ALTERNATE_ENTRY % match.group(1)
+            elif self.assembler == "gas":
+                return GAS_ALTERNATE_ENTRY % (match.group(1), match.group(1))
+
+        match = FUNCTION_END_PATTERN.match(line)
+        if (match):
+            # Check that the function end has the same MUL_ or NESTED_ prefix as the function start
+            if (self.is_nested_function ^ (match.group(1) == "NESTED_")) or \
+                (self.is_mul_function ^ (match.group(2) == "MUL_")):
+                logging.error("Function start and end do not have the same MUL_ or NESTED_ prefix!\n\tStart: %s (line %d)\n\tEnd: %s (line %d)"
+                    % (self.function_start_match.group(0), self.function_start_line, match.group(0), line_num))
+                exit(1)
+            # Check that the function end label matches the function start label
+            if self.function_name != match.groups()[-1]:
+                logging.error("Function start label does not match function end label!\n\tStart: %s (line %d)\n\tEnd: %s (line %d)"
+                    % (self.function_name, self.function_start_line, match.groups()[-1], line_num))
+                exit(1)
+
+            epilogue = generate_epilogue(self.assembler, self.calling_convention, self.function_name, self.arg_count, self.reg_count, self.is_nested_function)
+
+            logging.info("%d: function end %s" % (line_num, self.function_name))
+
+            self.function_start_match = None
+            self.function_start_line = 0
+            self.is_nested_function = None
+            self.is_mul_function = None
+            self.calling_convention = None
+            self.function_name = None
+            self.arg_count = None
+            self.reg_count = None
+
+            return epilogue
+
+        # Replace any GET_MEMSLOT_OFFSET macros in the line
+        match = GET_MEMSLOT_PATTERN.search(line)
+        while(match):
+            slot = int(match.group(1))
+            replacement = self.calling_convention.gen_get_memslot_offset_fn(slot, self.arg_count, self.reg_count)
+            line = GET_MEMSLOT_PATTERN.sub(replacement, line)
+            match = GET_MEMSLOT_PATTERN.search(line)
+
+            logging.info("%d: memslot macro %d" % (line_num, slot))
+
+        # Not modifying the line any further
+        return line
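+
+    # Illustration (values follow from gen_get_memslot_offset_amd64_systemv above): in a
+    # leaf (non-nested) amd64 SystemV function, GET_MEMSLOT_OFFSET(slot0) in a source
+    # line is rewritten to
+    #   -8 /*MEMSLOT0*/
+    # (the first 8B slot of the red zone below rsp), whereas in a nested function it
+    # becomes 0 /*MEMSLOT0*/, the bottom of the 32B scratch space that the nested
+    # prologue reserves above rsp.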
+
+    def process_macro_line(self, line, line_num):
+        # Currently in a macro
+        match = MACRO_END_PATTERN.match(line)
+        if (match):
+            logging.info("%d: macro end %s" % (line_num, self.macro_name))
+
+            self.macro_start_match = None
+            self.macro_name = None
+            self.macro_args = None
+
+            if self.assembler == "masm":
+                return MASM_MACRO_END
+            elif self.assembler == "gas":
+                return GAS_MACRO_END
+
+        if self.assembler == "gas":
+            # In GAS macros we need to escape all of the macro arguments with a backslash in the macro body.
+            # Note this is a plain pattern substitution, so macro argument names must not appear as
+            # substrings of other identifiers in the macro body.
+            for arg in self.macro_args:
+                line = re.sub(arg, r"\\%s" % arg, line)
+
+        # Not modifying the line any further
+        return line
+
+def process_file(target, infilename, outfilename):
+    assembler = None
+    if target == "masm":
+        assembler = "masm"
+        normal_calling_convention = CALLING_CONVENTION_AMD64_MSFT
+        mul_calling_convention = CALLING_CONVENTION_AMD64_MSFT_MUL
+        nested_calling_convention = CALLING_CONVENTION_AMD64_MSFT_NESTED
+    elif target == "gas":
+        assembler = "gas"
+        normal_calling_convention = CALLING_CONVENTION_AMD64_SYSTEMV
+        mul_calling_convention = CALLING_CONVENTION_AMD64_SYSTEMV_MUL
+        nested_calling_convention = CALLING_CONVENTION_AMD64_SYSTEMV_NESTED
+    else:
+        logging.error("Unhandled target (%s) - only masm and gas are supported!" % target)
+        exit(1)
+
+    # Iterate through the file line by line in one pass
+    file_processing_state = ProcessingStateMachine(
+        assembler, normal_calling_convention, mul_calling_convention, nested_calling_convention)
+
+    with open(infilename) as infile:
+        with open(outfilename, "w") as outfile:
+            for line_num, line in enumerate(infile, start=1):
+                processed_line = file_processing_state.process_line(line, line_num)
+                outfile.write(processed_line)
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser(description="Preprocess symcryptasm into files that will be further processed with the C preprocessor to generate MASM or GAS")
+    parser.add_argument('target', type=str, choices=['masm', 'gas'], help='Target assembler to preprocess for')
+    parser.add_argument('inputfile', type=str, help='Path to input file')
+    parser.add_argument('outputfile', type=str, help='Path to output file')
+
+    args = parser.parse_args()
+    process_file(args.target, args.inputfile, args.outputfile)
diff --git a/unittest/lib/main.cpp b/unittest/lib/main.cpp
index e9b46ec..2ade7d7 100644
--- a/unittest/lib/main.cpp
+++ b/unittest/lib/main.cpp
@@ -7,7 +7,6 @@
 
 #include "precomp.h"
 
-#define EQU =
 #include "C_asm_shared.inc"
 
 VOID
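
Usage sketch (file names are hypothetical and the sys.path handling is an assumption; the script itself only uses the Python standard library):

    import sys
    sys.path.insert(0, "scripts")  # make scripts/symcryptasm_processor.py importable (assumption)
    from symcryptasm_processor import process_file

    # Rewrite a symcryptasm source into input for the C preprocessor, targeting GAS;
    # passing "masm" instead selects the MSFT x64 calling conventions and MASM markup.
    process_file("gas", "lib/example_asm.symcryptasm", "example_asm.cppasm")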