Mirror of https://github.com/microsoft/SymCrypt.git
Merged PR 6438924: Enable SymCryptAsm for Arm64
+ Extends SymCryptAsm format and script to work in the Arm64 context
+ Now specify architecture, assembler, and calling convention in script invocation
+ Make various changes to assembly to remove redundant instructions, and generally slightly improve perf for all platforms (a couple of % here and there)
+ Use assembly routines in Linux builds and remove asmstubs file
+ Do not enable Windows Arm64 build with CMake yet

Related work items: #35613721
Parent: c5ef94321c
Commit: 2bc541799d
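
The symcryptasm processor is now invoked with the output assembler, target architecture, and calling convention ahead of the input and output paths (see the updated add_custom_command in the lib CMakeLists diff below). A minimal sketch of a direct invocation, with illustrative input/output paths:

python3 scripts/symcryptasm_processor.py gas arm64 aapcs64 lib/arm64/fdef_asm.symcryptasm fdef_asm-gas.cppasm
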
@ -61,6 +61,9 @@ if(WIN32)
else()
if(NOT SYMCRYPT_TARGET_ENV MATCHES "Generic")
enable_language(ASM)
# Suppress noisy warnings about compile options which are ignored for ASM
# Less messy than restricting most of the below options to only C/CXX!
add_compile_options($<$<COMPILE_LANGUAGE:ASM>:-Wno-unused-command-line-argument>)
endif()
# add_compile_options(-Wall)
# add_compile_options(-Wno-unknown-pragmas)

@ -76,6 +79,12 @@ else()
# Avoids error: cast from pointer to smaller type 'uintptr_t' when including <memory> from aarch64-linux-gnu
add_compile_options(-fms-extensions)

# GCC and clang unroll more aggressively than they should for best performance
# When we want to unroll loops, we unroll in the source code, so tell the compiler not to unroll
# (clang seems to respect this option globally, but I could only make GCC behave in AES-GCM by
# using GCC-specific pragmas for the loops of interest)
add_compile_options(-fno-unroll-loops)

# In Sanitize version, enable sanitizers
if (CMAKE_BUILD_TYPE MATCHES Sanitize)
add_compile_options(-fsanitize=address)

@ -120,12 +129,6 @@ else()
add_link_options(-fsanitize=vptr)
add_link_options(-fno-sanitize-recover=all)
endif()

# GCC and clang unroll more aggressively than they should for best performance
# When we want to unroll loops, we unroll in the source code, so tell the compiler not to unroll
# (clang seems to respect this option globally, but I could only make GCC behave in AES-GCM by
# using GCC-specific pragmas for the loops of interest)
add_compile_options(-fno-unroll-loops)
endif()

if(CMAKE_BUILD_TYPE MATCHES Release)

@ -1,5 +1,5 @@
# This toolchain file configures CMake options for Linux User Mode AMD64 compilation with CPU optimizations.
# To use the toolchain file, run cmake .. -DCMAKE_TOOLCHAIN_FILE=cmake-toolchain/LinuxUserMode-AMD64.cmake
# To use the toolchain file, run cmake .. -DCMAKE_TOOLCHAIN_FILE="cmake-toolchain/LinuxUserMode-AMD64.cmake"

# Set CMake variables that subsequent CMake scripts can check against
set(CMAKE_SYSTEM_NAME Linux)

@ -1,5 +1,5 @@
# This toolchain file configures CMake options for Linux User Mode ARM64 compilation with CPU optimizations.
# To use the toolchain file, run cmake .. -DCMAKE_TOOLCHAIN_FILE=cmake-toolchain/LinuxUserMode-ARM64.cmake
# To use the toolchain file, run cmake .. -DCMAKE_TOOLCHAIN_FILE="cmake-toolchain/LinuxUserMode-ARM64.cmake"

# Set CMake variables that subsequent CMake scripts can check against
set(CMAKE_SYSTEM_NAME Linux)

@ -8,13 +8,14 @@ set(CMAKE_SYSTEM_PROCESSOR ARM64)
set(TARGET_TRIPLE aarch64-linux-gnu)

# Currently only use clang as it makes cross-compilation easier
set(CMAKE_ASM_COMPILER_TARGET ${TARGET_TRIPLE})
set(CMAKE_C_COMPILER clang)
set(CMAKE_C_COMPILER_TARGET ${TARGET_TRIPLE})
set(CMAKE_CXX_COMPILER clang++)
set(CMAKE_CXX_COMPILER_TARGET ${TARGET_TRIPLE})

# Point clang sysroot to cross compilation toolchain when cross compiling
if(NOT CMAKE_HOST_SYSTEM_PROCESSOR EQUAL CMAKE_SYSTEM_PROCESSOR)
if(NOT CMAKE_HOST_SYSTEM_PROCESSOR MATCHES ARM64|aarch64)
# C/C++ toolchain (installed on Ubuntu using apt-get gcc-aarch64-linux-gnu g++-aarch64-linux-gnu)
set(CMAKE_SYSROOT_COMPILE /usr/${TARGET_TRIPLE})

@ -1,5 +1,5 @@
# This toolchain file configures CMake options for Windows User Mode AMD64 compilation with CPU optimizations.
# To use the toolchain file, run cmake .. -DCMAKE_TOOLCHAIN_FILE=cmake-toolchain/WindowsUserMode-AMD64.cmake
# To use the toolchain file, run cmake .. -DCMAKE_TOOLCHAIN_FILE="cmake-toolchain/WindowsUserMode-AMD64.cmake"

# Set CMake variables that subsequent CMake scripts can check against
set(CMAKE_SYSTEM_NAME Windows)

@ -1,5 +1,5 @@
# This toolchain file configures CMake options for Windows User Mode x86 compilation with CPU optimizations.
# To use the toolchain file, run cmake .. -DCMAKE_TOOLCHAIN_FILE=cmake-toolchain/WindowsUserMode-X86.cmake -A Win32
# To use the toolchain file, run cmake .. -DCMAKE_TOOLCHAIN_FILE="cmake-toolchain/WindowsUserMode-X86.cmake" -A Win32
#
# (The "-A Win32" option seems to be required when compiling on a 64-bit host. Ideally this toolchain file
# should set all the required options, but I haven't figured out how to force 32-bit compilation from the

@ -105,7 +105,7 @@ function(process_cppasm filepath outformat archdefine)
if((NOT outformat STREQUAL gas) AND (NOT outformat STREQUAL masm))
message(FATAL_ERROR "cppasm processing invoked with unrecognized outformat (${outformat})")
endif()
if((NOT archdefine STREQUAL amd64) AND (NOT archdefine STREQUAL x86))
if((NOT archdefine STREQUAL amd64) AND (NOT archdefine STREQUAL x86) AND (NOT archdefine STREQUAL arm64))
message(FATAL_ERROR "cppasm processing invoked with unrecognized archdefine (${archdefine})")
endif()
get_filename_component(rootpath ${filepath} DIRECTORY)

@ -151,14 +151,20 @@ function(process_cppasm filepath outformat archdefine)
endif()
endfunction()

function(process_symcryptasm filepath outformat archdefine)
function(process_symcryptasm filepath outformat archdefine callingconvention)
get_filename_component(fileextension ${filepath} EXT)
if(NOT fileextension STREQUAL .symcryptasm)
message(FATAL_ERROR "symcryptasm processing invoked on file with incorrect extension (${filepath} -> ${fileextension})")
endif()
if((NOT outformat STREQUAL gas) AND (NOT outformat STREQUAL masm))
if((NOT outformat STREQUAL gas) AND (NOT outformat STREQUAL masm) AND (NOT outformat STREQUAL armasm64))
message(FATAL_ERROR "symcryptasm processing invoked with unrecognized outformat (${outformat})")
endif()
if((NOT archdefine STREQUAL amd64) AND (NOT archdefine STREQUAL x86) AND (NOT archdefine STREQUAL arm64))
message(FATAL_ERROR "symcryptasm processing invoked with unrecognized archdefine (${archdefine})")
endif()
if((NOT callingconvention STREQUAL msft) AND (NOT callingconvention STREQUAL systemv) AND (NOT callingconvention STREQUAL aapcs64) AND (NOT callingconvention STREQUAL arm64ec))
message(FATAL_ERROR "symcryptasm processing invoked with unrecognized callingconvention (${callingconvention})")
endif()
get_filename_component(rootpath ${filepath} DIRECTORY)
get_filename_component(filestem ${filepath} NAME_WE) # filestem is the filename w/out extension
set(filepath ${CMAKE_CURRENT_SOURCE_DIR}/${filepath})

@ -168,7 +174,7 @@ function(process_symcryptasm filepath outformat archdefine)
add_custom_command(
OUTPUT ${output_cppasm}
COMMAND ${CMAKE_COMMAND} -E make_directory ${output_directory}
COMMAND python3 ${CMAKE_SOURCE_DIR}/scripts/symcryptasm_processor.py ${outformat} ${filepath} ${output_cppasm}
COMMAND python3 ${CMAKE_SOURCE_DIR}/scripts/symcryptasm_processor.py ${outformat} ${archdefine} ${callingconvention} ${filepath} ${output_cppasm}
MAIN_DEPENDENCY ${filepath}
DEPENDS ${CMAKE_SOURCE_DIR}/scripts/symcryptasm_processor.py
COMMENT "Python preprocessing ${filepath} to ${outformat} (${output_cppasm})"

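With the new archdefine and callingconvention arguments, calls to this helper take the four-argument form used later in this diff, for example:

process_symcryptasm(arm64/fdef_asm.symcryptasm gas arm64 aapcs64)
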
@ -183,19 +189,15 @@ else()
if(CMAKE_SYSTEM_PROCESSOR MATCHES "AMD64")
list(APPEND SOURCES_COMMON linux/intrinsics.c)
endif()

if(CMAKE_SYSTEM_PROCESSOR MATCHES "ARM64")
list(APPEND SOURCES_COMMON linux/asmstubs.c)
endif()
endif()

if(WIN32 AND NOT(SYMCRYPT_TARGET_ENV MATCHES "Generic"))
if(CMAKE_SYSTEM_PROCESSOR MATCHES "AMD64")
process_symcryptasm(amd64/aesasm.symcryptasm masm amd64)
process_symcryptasm(amd64/fdef_asm.symcryptasm masm amd64)
process_symcryptasm(amd64/fdef369_asm.symcryptasm masm amd64)
process_symcryptasm(amd64/fdef_mulx.symcryptasm masm amd64)
process_symcryptasm(amd64/wipe.symcryptasm masm amd64)
process_symcryptasm(amd64/aesasm.symcryptasm masm amd64 msft)
process_symcryptasm(amd64/fdef_asm.symcryptasm masm amd64 msft)
process_symcryptasm(amd64/fdef369_asm.symcryptasm masm amd64 msft)
process_symcryptasm(amd64/fdef_mulx.symcryptasm masm amd64 msft)
process_symcryptasm(amd64/wipe.symcryptasm masm amd64 msft)

list(APPEND SOURCES_COMMON
amd64/aesasm-masm.asm

@ -229,11 +231,11 @@ if(WIN32 AND NOT(SYMCRYPT_TARGET_ENV MATCHES "Generic"))
endif()
elseif(NOT(SYMCRYPT_TARGET_ENV MATCHES "Generic"))
if(CMAKE_SYSTEM_PROCESSOR MATCHES "AMD64")
process_symcryptasm(amd64/aesasm.symcryptasm gas amd64)
process_symcryptasm(amd64/fdef_asm.symcryptasm gas amd64)
process_symcryptasm(amd64/fdef369_asm.symcryptasm gas amd64)
process_symcryptasm(amd64/fdef_mulx.symcryptasm gas amd64)
process_symcryptasm(amd64/wipe.symcryptasm gas amd64)
process_symcryptasm(amd64/aesasm.symcryptasm gas amd64 systemv)
process_symcryptasm(amd64/fdef_asm.symcryptasm gas amd64 systemv)
process_symcryptasm(amd64/fdef369_asm.symcryptasm gas amd64 systemv)
process_symcryptasm(amd64/fdef_mulx.symcryptasm gas amd64 systemv)
process_symcryptasm(amd64/wipe.symcryptasm gas amd64 systemv)

list(APPEND SOURCES_COMMON
amd64/aesasm-gas.asm

@ -248,6 +250,20 @@ elseif(NOT(SYMCRYPT_TARGET_ENV MATCHES "Generic"))
amd64/fdef_mulx-gas.asm
amd64/wipe-gas.asm
PROPERTY LANGUAGE ASM)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ARM64")
process_symcryptasm(arm64/fdef_asm.symcryptasm gas arm64 aapcs64)
process_symcryptasm(arm64/fdef369_asm.symcryptasm gas arm64 aapcs64)
process_symcryptasm(arm64/wipe.symcryptasm gas arm64 aapcs64)

list(APPEND SOURCES_COMMON
arm64/fdef_asm-gas.asm
arm64/fdef369_asm-gas.asm
arm64/wipe-gas.asm)
set_source_files_properties(
arm64/fdef_asm-gas.asm
arm64/fdef369_asm-gas.asm
arm64/wipe-gas.asm
PROPERTY LANGUAGE ASM)
endif()
endif()

@ -31,7 +31,7 @@ const SYMCRYPT_MODULAR_FUNCTIONS g_SymCryptModFns[] = {
SYMCRYPT_MOD_FUNCTIONS_FDEF_MONTGOMERY1024, // Special faster code for 1024-bit Montgomery moduli
SYMCRYPT_MOD_FUNCTIONS_FDEF_MONTGOMERY_MULX1024, // Special faster code for 1024-bit Montgomery moduli, MULX-based code

#elif SYMCRYPT_CPU_ARM64 && SYMCRYPT_MS_VC
#elif SYMCRYPT_CPU_ARM64

SYMCRYPT_MOD_FUNCTIONS_FDEF369_MONTGOMERY,
{NULL,},

@ -68,7 +68,7 @@ const SYMCRYPT_MODULUS_TYPE_SELECTION_ENTRY SymCryptModulusTypeSelections[] =
{('xM' << 16) + SymCryptModFntableMontgomeryMulx, SYMCRYPT_CPU_FEATURES_FOR_MULX, 0, SYMCRYPT_MODULUS_FEATURE_MONTGOMERY },
{('1M' << 16) + SymCryptModFntableMontgomery1024, 0, 1024, SYMCRYPT_MODULUS_FEATURE_MONTGOMERY },

#elif SYMCRYPT_CPU_ARM64 && SYMCRYPT_MS_VC
#elif SYMCRYPT_CPU_ARM64

{('mM' << 16) + SymCryptModFntableMontgomery, 0, 256, SYMCRYPT_MODULUS_FEATURE_MONTGOMERY },
{('9M' << 16) + SymCryptModFntable369Montgomery, 0, 384, SYMCRYPT_MODULUS_FEATURE_MONTGOMERY },

@ -1,472 +0,0 @@
|
|||
;
|
||||
; fdef_369asm.asm Assembler code for large integer arithmetic in the default data format
|
||||
;
|
||||
; This file contains alternative routines that pretend that each digit is only 3 words.
|
||||
; This gets used if the number is 1, 2, 3, 5, 6, or 9 digits long.
|
||||
; The immediate advantage is that it improves EC performance on 192, 384, and 521-bit curves.
|
||||
;
|
||||
; Most of this code is a direct copy of the default code.
|
||||
;
|
||||
; Copyright (c) Microsoft Corporation. Licensed under the MIT license.
|
||||
;
|
||||
|
||||
#include "ksarm64.h"
|
||||
|
||||
; As Arm assembler already uses C preprocessor, we can just hardcode this asm to include constants
|
||||
; MASM for now. To be fixed properly when converting arm64 asm to symcryptasm.
|
||||
#define SYMCRYPT_MASM
|
||||
#include "C_asm_shared.inc"
|
||||
#undef SYMCRYPT_MASM
|
||||
|
||||
#include "symcrypt_version.inc"
|
||||
#include "symcrypt_name_mangling.inc"
|
||||
#include "symcrypt_magic.inc"
|
||||
|
||||
; A digit consists of 3 words of 64 bits each
|
||||
|
||||
;UINT32
|
||||
;SYMCRYPT_CALL
|
||||
; SymCryptFdef369RawAdd(
|
||||
; _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc1,
|
||||
; _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc2,
|
||||
; _Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 pDst,
|
||||
; UINT32 nDigits );
|
||||
;
|
||||
; Initial inputs to registers:
|
||||
; pSrc1 -> x0
|
||||
; pSrc2 -> x1
|
||||
; pDst -> x2
|
||||
; nDigits -> x3
|
||||
|
||||
LEAF_ENTRY ARM64EC_NAME_MANGLE(SymCryptFdef369RawAddAsm)
|
||||
|
||||
neg x3, x3 ; negate the digit count
|
||||
ands x4, x4, x4 ; Zero the carry flag
|
||||
|
||||
SymCryptFdef369RawAddAsmLoop
|
||||
add x3, x3, #1 ; Increment the digit count by one
|
||||
; carry is in the carry flag
|
||||
|
||||
ldp x4, x6, [x0], #16 ; Load two words of pSrc1
|
||||
ldp x5, x7, [x1], #16 ; Load two words of pSrc2
|
||||
adcs x4, x4, x5
|
||||
adcs x6, x6, x7
|
||||
stp x4, x6, [x2], #16 ; Store the result in the destination
|
||||
|
||||
ldr x4, [x0], #8
|
||||
ldr x5, [x1], #8
|
||||
adcs x4, x4, x5
|
||||
str x4, [x2], #8
|
||||
|
||||
cbnz x3, SymCryptFdef369RawAddAsmLoop
|
||||
|
||||
csetcs x0 ; Set the return value equal to the carry
|
||||
|
||||
ret
|
||||
|
||||
LEAF_END ARM64EC_NAME_MANGLE(SymCryptFdef369RawAddAsm)
|
||||
|
||||
;UINT32
|
||||
;SYMCRYPT_CALL
|
||||
;SymCryptFdef369RawSub(
|
||||
; _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 Src1,
|
||||
; _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 Src2,
|
||||
; _Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 Dst,
|
||||
; UINT32 nDigits )
|
||||
;
|
||||
; Initial inputs to registers:
|
||||
; pSrc1 -> x0
|
||||
; pSrc2 -> x1
|
||||
; pDst -> x2
|
||||
; nDigits -> x3
|
||||
|
||||
LEAF_ENTRY ARM64EC_NAME_MANGLE(SymCryptFdef369RawSubAsm)
|
||||
|
||||
neg x3, x3 ; negate the digit count
|
||||
subs x4, x4, x4 ; Set the carry flag (i.e. no borrow)
|
||||
|
||||
SymCryptFdef369RawSubAsmLoop
|
||||
add x3, x3, #1 ; Increment the digit count by one
|
||||
; borrow is in the carry flag (flipped)
|
||||
|
||||
ldp x4, x6, [x0], #16 ; Load two words of pSrc1
|
||||
ldp x5, x7, [x1], #16 ; Load two words of pSrc2
|
||||
sbcs x4, x4, x5
|
||||
sbcs x6, x6, x7
|
||||
stp x4, x6, [x2], #16 ; Store the result in the destination
|
||||
|
||||
ldr x4, [x0], #8
|
||||
ldr x5, [x1], #8
|
||||
sbcs x4, x4, x5
|
||||
str x4, [x2], #8
|
||||
|
||||
cbnz x3, SymCryptFdef369RawSubAsmLoop
|
||||
|
||||
csetcc x0 ; If the carry is clear (borrow), set the return value to 1
|
||||
|
||||
ret
|
||||
|
||||
LEAF_END ARM64EC_NAME_MANGLE(SymCryptFdef369RawSubAsm)
|
||||
|
||||
;VOID
|
||||
;SYMCRYPT_CALL
|
||||
;SymCryptFdef369MaskedCopy(
|
||||
; _In_reads_bytes_( nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCBYTE pbSrc,
|
||||
; _InOut_writes_bytes_( nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PBYTE pbDst,
|
||||
; UINT32 nDigits,
|
||||
; UINT32 mask )
|
||||
|
||||
LEAF_ENTRY ARM64EC_NAME_MANGLE(SymCryptFdef369MaskedCopyAsm)
|
||||
|
||||
neg x2, x2 ; negate the digit count
|
||||
subs x4, XZR, x3 ; If (x3 > 0) clear the carry flag (i.e. borrow)
|
||||
|
||||
SymCryptFdef369MaskedCopyAsmLoop
|
||||
add x2, x2, #1 ; Increment the digit count by one
|
||||
|
||||
ldp x4, x6, [x0], #16 ; Load two words of the source
|
||||
ldp x5, x7, [x1] ; Load two words of the destination
|
||||
cselcc x4, x4, x5 ; If the carry is clear, select the source operands
|
||||
cselcc x6, x6, x7
|
||||
stp x4, x6, [x1], #16 ; Store the two words in the destination
|
||||
|
||||
ldr x4, [x0], #8
|
||||
ldr x5, [x1]
|
||||
cselcc x4, x4, x5
|
||||
str x4, [x1], #8
|
||||
|
||||
cbnz x2, SymCryptFdef369MaskedCopyAsmLoop
|
||||
|
||||
; Done, no return value
|
||||
|
||||
ret
|
||||
|
||||
LEAF_END ARM64EC_NAME_MANGLE(SymCryptFdef369MaskedCopyAsm)
|
||||
|
||||
;VOID
|
||||
;SYMCRYPT_CALL
|
||||
;SymCryptFdef369RawMul(
|
||||
; _In_reads_(nWords1) PCUINT32 pSrc1,
|
||||
; UINT32 nDigits1,
|
||||
; _In_reads_(nWords2) PCUINT32 pSrc2,
|
||||
; UINT32 nDigits2,
|
||||
; _Out_writes_(nWords1 + nWords2) PUINT32 pDst )
|
||||
;
|
||||
; Initial inputs to registers:
|
||||
; pSrc1 -> x0
|
||||
; nDigits1 -> x1
|
||||
; pSrc2 -> x2
|
||||
; nDigits2 -> x3
|
||||
; pDst -> x4
|
||||
;
|
||||
; Basic structure:
|
||||
; for each word in Src1:
|
||||
; Dst += Src2 * word
|
||||
;
|
||||
; Register assignments
|
||||
; x0 = pSrc1 (moving forward one word every outer loop)
|
||||
; x1 = negated word count of pSrc1
|
||||
; x2 = pSrc2 (moving forward one *digit* every inner loop)
|
||||
; x3 = negated digit count of pSrc2 and pDst
|
||||
; x4 = pDst (moving forward one *digit* every inner loop)
|
||||
; x5 = Stored pDst (moving forward one word every outer loop)
|
||||
; x6 = Current word loaded from pSrc1
|
||||
; x8, x9 = Current words loaded in pairs from pSrc2
|
||||
; x10, x11 = Current words loaded in pairs from pDst
|
||||
; x12, x15 = "128-bit" sliding register to hold the result of multiplies
|
||||
; x16 = Stored pSrc2
|
||||
; x17 = Stored negated digit count of pSrc2
|
||||
; Note x13, x14 are reserved in ARM64EC and thus are not used
|
||||
|
||||
|
||||
LEAF_ENTRY ARM64EC_NAME_MANGLE(SymCryptFdef369RawMulAsm)
|
||||
|
||||
add x1, x1, x1, LSL #1 ; Calculate word count (x1 * 3)
|
||||
|
||||
neg x1, x1 ; negate nWords1
|
||||
neg x3, x3 ; negate nDigits2
|
||||
|
||||
mov x5, x4 ; store pDst
|
||||
mov x16, x2 ; store pSrc2
|
||||
mov x17, x3 ; store -nDigits2 for later
|
||||
|
||||
;
|
||||
; First iteration of main loop (no adding of previous values from pDst)
|
||||
;
|
||||
ands x15, x15, XZR ; Clearing the carry flag and setting x15 = 0
|
||||
ldr x6, [x0] ; load the first word from pSrc1
|
||||
|
||||
SymCryptFdef369RawMulAsmLoopInner1
|
||||
add x3, x3, #1 ; move one digit up
|
||||
|
||||
ldp x8, x9, [x2], #16 ; load 2 words from pSrc2
|
||||
|
||||
mul x12, x6, x8 ; Bits <63:0> of pSrc1[0]*pSrc2[j]
|
||||
adcs x12, x12, x15 ; Adding the previous word (if there was a carry from the last addition it is added)
|
||||
umulh x15, x6, x8 ; Bits <127:64> of pSrc1[0]*pSrc2[j]
|
||||
str x12, [x4], #8 ; Store to destination
|
||||
|
||||
mul x12, x6, x9 ; Bits <63:0> of pSrc1[0]*pSrc2[j+1]
|
||||
adcs x12, x12, x15 ; Adding the previous word (if there was a carry from the last addition it is added)
|
||||
umulh x15, x6, x9 ; Bits <127:64> of pSrc1[0]*pSrc2[j+1]
|
||||
str x12, [x4], #8 ; Store to destination
|
||||
|
||||
ldr x8, [x2], #8
|
||||
|
||||
mul x12, x6, x8 ; Bits <63:0> of pSrc1[0]*pSrc2[j+2]
|
||||
adcs x12, x12, x15 ; Adding the previous word (if there was a carry from the last addition it is added)
|
||||
umulh x15, x6, x8 ; Bits <127:64> of pSrc1[0]*pSrc2[j+2]
|
||||
str x12, [x4], #8 ; Store to destination
|
||||
|
||||
cbnz x3, SymCryptFdef369RawMulAsmLoopInner1
|
||||
|
||||
adc x15, x15, XZR ; Store the next word into the destination (with the carry if any)
|
||||
str x15, [x4]
|
||||
|
||||
add x1, x1, #1 ; move one word up
|
||||
add x0, x0, #8 ; move start of pSrc1 one word up
|
||||
add x5, x5, #8 ; move start of pDst one word up
|
||||
|
||||
;
|
||||
; MAIN LOOP
|
||||
;
|
||||
SymCryptFdef369RawMulAsmLoopOuter
|
||||
mov x3, x17 ; set -nDigits2
|
||||
mov x2, x16 ; set pSrc2
|
||||
mov x4, x5 ; set pDst
|
||||
|
||||
ands x15, x15, XZR ; Clearing the carry flag and setting x15 = 0
|
||||
ldr x6, [x0] ; load the next word from pSrc1
|
||||
|
||||
SymCryptFdef369RawMulAsmLoopInner
|
||||
add x3, x3, #1 ; move one digit up
|
||||
|
||||
ldp x8, x9, [x2], #16 ; load 2 words from pSrc2
|
||||
ldp x10, x11, [x4] ; load 2 words from pDst
|
||||
|
||||
mul x12, x6, x8 ; Bits <63:0> of pSrc1[i]*pSrc2[j]
|
||||
adcs x12, x12, x15 ; Adding the previous word (if there was a carry from the last addition it is added)
|
||||
umulh x15, x6, x8 ; Bits <127:64> of pSrc1[i]*pSrc2[j]
|
||||
adc x15, x15, XZR ; Add the carry if any and don't update the flags
|
||||
; Note: this cannot overflow as the maximum for <x15:x12> is (2^64-1)(2^64-1)+(2^64-1)+1 = 2^128 - 2^64 + 1
|
||||
adds x12, x12, x10 ; add the word from the destination and update the flags (this can overflow)
|
||||
str x12, [x4], #8 ; Store to destination
|
||||
|
||||
mul x12, x6, x9 ; Bits <63:0> of pSrc1[i]*pSrc2[j+1]
|
||||
adcs x12, x12, x15 ; Adding the previous word (if there was a carry from the last addition it is added)
|
||||
umulh x15, x6, x9 ; Bits <127:64> of pSrc1[i]*pSrc2[j+1]
|
||||
adc x15, x15, XZR ; Add the carry if any and don't update the flags
|
||||
adds x12, x12, x11 ; add the word from the destination and update the flags (this can overflow)
|
||||
str x12, [x4], #8 ; Store to destination
|
||||
|
||||
ldr x8, [x2], #8
|
||||
ldr x10, [x4]
|
||||
|
||||
mul x12, x6, x8 ; Bits <63:0> of pSrc1[i]*pSrc2[j+2]
|
||||
adcs x12, x12, x15 ; Adding the previous word (if there was a carry from the last addition it is added)
|
||||
umulh x15, x6, x8 ; Bits <127:64> of pSrc1[i]*pSrc2[j+2]
|
||||
adc x15, x15, XZR ; Add the carry if any and don't update the flags
|
||||
adds x12, x12, x10 ; add the word from the destination and update the flags (this can overflow)
|
||||
str x12, [x4], #8 ; Store to destination
|
||||
|
||||
cbnz x3, SymCryptFdef369RawMulAsmLoopInner
|
||||
|
||||
adc x15, x15, XZR ; Store the next word into the destination (with the carry if any)
|
||||
str x15, [x4]
|
||||
|
||||
adds x1, x1, #1 ; move one word up
|
||||
add x0, x0, #8 ; move start of pSrc1 one word up
|
||||
add x5, x5, #8 ; move start of pDst one word up
|
||||
|
||||
bne SymCryptFdef369RawMulAsmLoopOuter
|
||||
|
||||
; Done, no return value
|
||||
|
||||
ret
|
||||
|
||||
LEAF_END ARM64EC_NAME_MANGLE(SymCryptFdef369RawMulAsm)
|
||||
|
||||
;VOID
|
||||
;SymCryptFdef369MontgomeryReduceAsm(
|
||||
; _In_ PCSYMCRYPT_MODULUS pmMod,
|
||||
; _In_ PUINT32 pSrc,
|
||||
; _Out_ PUINT32 pDst )
|
||||
;
|
||||
; Initial inputs to registers:
|
||||
; pmMod -> x0
|
||||
; pSrc -> x1
|
||||
; pDst -> x2
|
||||
;
|
||||
; Register assignments
|
||||
; x0 = pMod (moving forward one *digit* every inner loop)
|
||||
; x1 = pSrc (moving forward one *digit* every inner loop)
|
||||
; x2 = pDst (used only in the end for subtract / result)
|
||||
; x3 = negated digit count of pSrc and pMod
|
||||
; x4 = negated word count of pSrc
|
||||
; x5 = Inv64 of the modulus
|
||||
; x6 = m = pSrc[i]*Inv64
|
||||
; x7 = hc = high carry variable
|
||||
; x8, x9 = Current words loaded in pairs from pSrc
|
||||
; x10, x11 = Current words loaded in pairs from pMod
|
||||
; x12, x15 = c variable = "128-bit" sliding register to hold the result of multiplies
|
||||
; x16 = Temporary intermediate result
|
||||
; x17 = Stored negated digit count of pSrc
|
||||
; x19 = Stored pMod pointer
|
||||
; x20 = Stored pSrc pointer (moving forward one word every outer loop)
|
||||
; Note x13, x14 are reserved in ARM64EC and thus are not used
|
||||
|
||||
NESTED_ENTRY ARM64EC_NAME_MANGLE(SymCryptFdef369MontgomeryReduceAsm)
|
||||
PROLOG_SAVE_REG_PAIR fp, lr, #-32! ; allocate 32 bytes of stack; store FP/LR
|
||||
PROLOG_SAVE_REG_PAIR x19, x20, #16 ; free up x19/x20
|
||||
|
||||
ldr w3, [x0, #SymCryptModulusNdigitsOffsetArm64] ; # of Digits
|
||||
ldr x5, [x0, #SymCryptModulusMontgomeryInv64OffsetArm64] ; Inv64 of modulus
|
||||
add x0, x0, #SymCryptModulusValueOffsetArm64 ; pMod
|
||||
|
||||
add x4, x3, x3, LSL #1 ; Calculate word count (x3 * 3)
|
||||
|
||||
neg x3, x3 ; Negate the digit count
|
||||
neg x4, x4 ; Negate the word count
|
||||
|
||||
mov x17, x3 ; Store the digit count for later
|
||||
mov x19, x0 ; Store the pMod pointer
|
||||
mov x20, x1 ; Store the pSrc pointer
|
||||
|
||||
ands x7, x7, XZR ; Set hc to 0
|
||||
|
||||
;
|
||||
; Main loop
|
||||
;
|
||||
SymCryptFdef369MontgomeryReduceAsmOuter
|
||||
ldr x8, [x1] ; Load 1 word from pSrc
|
||||
mul x6, x8, x5 ; <63:0> bits of pSrc[i]*Inv64 = m
|
||||
|
||||
ands x12, x12, XZR ; Set c to 0
|
||||
ands x15, x15, XZR ; Set c to 0
|
||||
|
||||
SymCryptFdef369MontgomeryReduceAsmInner
|
||||
ldp x10, x11, [x0], #16 ; pMod[j]
|
||||
ldp x8, x9, [x1] ; pSrc[j]
|
||||
|
||||
mul x16, x6, x10 ; <63:0> of pMod[j]*m
|
||||
adds x16, x16, x8 ; Adding pSrc[j]
|
||||
umulh x15, x6, x10 ; <127:64> of pMod[j]*m
|
||||
adc x15, x15, XZR ; Add the carry if any (***)
|
||||
adds x12, x12, x16 ; Add the lower bits of c
|
||||
adc x15, x15, XZR ; Add the carry if any (***)
|
||||
; ***: These cannot produce extra carry as the maximum is
|
||||
; (2^64 - 1)*(2^64-1) + 2^64-1 + 2^64-1 = 2^128 - 1
|
||||
str x12, [x1], #8 ; pSrc[j] = (UINT64) c
|
||||
mov x12, x15 ; c >>= 64
|
||||
|
||||
mul x16, x6, x11 ; <63:0> of pMod[j]*m
|
||||
adds x16, x16, x9 ; Adding pSrc[j]
|
||||
umulh x15, x6, x11 ; <127:64> of pMod[j]*m
|
||||
adc x15, x15, XZR ; Add the carry if any (***)
|
||||
adds x12, x12, x16 ; Add the lower bits of c
|
||||
adc x15, x15, XZR ; Add the carry if any (***)
|
||||
str x12, [x1], #8 ; pSrc[j] = (UINT64) c
|
||||
mov x12, x15 ; c >>= 64
|
||||
|
||||
ldr x10, [x0], #8 ; pMod[j]
|
||||
ldr x8, [x1] ; pSrc[j]
|
||||
|
||||
mul x16, x6, x10 ; <63:0> of pMod[j]*m
|
||||
adds x16, x16, x8 ; Adding pSrc[j]
|
||||
umulh x15, x6, x10 ; <127:64> of pMod[j]*m
|
||||
adc x15, x15, XZR ; Add the carry if any (***)
|
||||
adds x12, x12, x16 ; Add the lower bits of c
|
||||
adc x15, x15, XZR ; Add the carry if any (***)
|
||||
str x12, [x1], #8 ; pSrc[j] = (UINT64) c
|
||||
mov x12, x15 ; c >>= 64
|
||||
|
||||
adds x3, x3, #1 ; Move one digit up
|
||||
bne SymCryptFdef369MontgomeryReduceAsmInner
|
||||
|
||||
ldr x8, [x1] ; pSrc[nWords]
|
||||
adds x12, x12, x8 ; c + pSrc[nWords]
|
||||
adc x15, XZR, XZR ; Add the carry if any
|
||||
|
||||
adds x12, x12, x7 ; c + pSrc[nWords] + hc
|
||||
adc x7, x15, XZR ; Add the carry if any and store into hc
|
||||
|
||||
str x12, [x1] ; pSrc[nWords] = c
|
||||
|
||||
adds x4, x4, #1 ; Move one word up
|
||||
|
||||
add x20, x20, #8 ; Move stored pSrc pointer one word up
|
||||
mov x0, x19 ; Restore pMod pointer
|
||||
mov x1, x20 ; Restore pSrc pointer
|
||||
|
||||
mov x3, x17 ; Restore the digit counter
|
||||
|
||||
bne SymCryptFdef369MontgomeryReduceAsmOuter
|
||||
|
||||
;
|
||||
; Subtraction
|
||||
;
|
||||
|
||||
mov x16, x2 ; Store pDst pointer
|
||||
|
||||
; Prepare the pointers for subtract
|
||||
mov x0, x20 ; pSrc
|
||||
mov x1, x19 ; pMod
|
||||
|
||||
mov x10, x7 ; x10 = hc
|
||||
mov x3, x17 ; Restore the digit counter
|
||||
subs x4, x4, x4 ; Set the carry flag (i.e. no borrow)
|
||||
|
||||
SymCryptFdef369MontgomeryReduceRawSubAsmLoop
|
||||
add x3, x3, #1 ; Increment the digit count by one
|
||||
; borrow is in the carry flag (flipped)
|
||||
|
||||
ldp x4, x6, [x0], #16 ; Load two words of pSrc1
|
||||
ldp x5, x7, [x1], #16 ; Load two words of pSrc2
|
||||
sbcs x4, x4, x5
|
||||
sbcs x6, x6, x7
|
||||
stp x4, x6, [x2], #16 ; Store the result in the destination
|
||||
|
||||
ldr x4, [x0], #8
|
||||
ldr x5, [x1], #8
|
||||
sbcs x4, x4, x5
|
||||
str x4, [x2], #8
|
||||
|
||||
cbnz x3, SymCryptFdef369MontgomeryReduceRawSubAsmLoop
|
||||
|
||||
csetcc x0 ; If the carry is clear (borrow), set the return value to 1
|
||||
|
||||
orr x11, x10, x0 ; x11 = hc|d
|
||||
|
||||
; Prepare the pointers for masked copy
|
||||
mov x0, x20 ; pSrc
|
||||
mov x1, x16 ; pDst
|
||||
|
||||
mov x2, x17 ; Restore the digit counter
|
||||
subs x4, x10, x11 ; If (x11 > x10) clear the carry flag (i.e. borrow)
|
||||
|
||||
SymCryptFdef369MontgomeryReduceMaskedCopyAsmLoop
|
||||
add x2, x2, #1 ; Increment the digit count by one
|
||||
|
||||
ldp x4, x6, [x0], #16 ; Load two words of the source
|
||||
ldp x5, x7, [x1] ; Load two words of the destination
|
||||
cselcc x4, x4, x5 ; If the carry is clear, select the source operands
|
||||
cselcc x6, x6, x7
|
||||
stp x4, x6, [x1], #16 ; Store the two words in the destination
|
||||
|
||||
ldr x4, [x0], #8
|
||||
ldr x5, [x1]
|
||||
cselcc x4, x4, x5
|
||||
str x4, [x1], #8
|
||||
|
||||
cbnz x2, SymCryptFdef369MontgomeryReduceMaskedCopyAsmLoop
|
||||
|
||||
; Done, no return value
|
||||
|
||||
EPILOG_RESTORE_REG_PAIR x19, x20, #16
|
||||
EPILOG_RESTORE_REG_PAIR fp, lr, #32!
|
||||
EPILOG_RETURN
|
||||
|
||||
NESTED_END ARM64EC_NAME_MANGLE(SymCryptFdef369MontgomeryReduceAsm)
|
||||
|
||||
END
|
||||
|
|
@ -0,0 +1,465 @@
|
|||
//
|
||||
// fdef369_asm.symcryptasm Assembler code for large integer arithmetic in the default data format
|
||||
// Expresses asm in a generic enough way to enable generation of MASM and GAS using the
|
||||
// symcryptasm_processor.py script and C preprocessor
|
||||
//
|
||||
// This file contains alternative routines that pretend that each digit is only 3 words.
|
||||
// This gets used if the number is 1, 2, 3, 5, 6, or 9 digits long.
|
||||
// The immediate advantage is that it improves EC performance on 192, 384, and 521-bit curves.
|
||||
//
|
||||
// Most of this code is a direct copy of the default code.
|
||||
//
|
||||
// Copyright (c) Microsoft Corporation. Licensed under the MIT license.
|
||||
//
|
||||
|
||||
#include "symcryptasm_shared.cppasm"
|
||||
|
||||
// A digit consists of 3 words of 64 bits each
|
||||
|
||||
//UINT32
|
||||
//SYMCRYPT_CALL
|
||||
//SymCryptFdef369RawAddAsm(
|
||||
// _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 Src1,
|
||||
// _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 Src2,
|
||||
// _Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 Dst,
|
||||
// UINT32 nDigits )
|
||||
|
||||
FUNCTION_START(ARM64EC_NAME_MANGLE(SymCryptFdef369RawAddAsm), 4, 8)
|
||||
|
||||
ldp X_4, X_6, [X_0] // Load two words of pSrc1
|
||||
ldp X_5, X_7, [X_1] // Load two words of pSrc2
|
||||
adds X_4, X_4, X_5
|
||||
adcs X_6, X_6, X_7
|
||||
stp X_4, X_6, [X_2] // Store the result in the destination
|
||||
|
||||
ldr X_4, [X_0, #16] // Load one word of pSrc1
|
||||
sub X_3, X_3, #1 // Decrement the digit count by one
|
||||
ldr X_5, [X_1, #16] // Load one word of pSrc2
|
||||
adcs X_4, X_4, X_5
|
||||
str X_4, [X_2, #16] // Store the result in the destination
|
||||
|
||||
cbz X_3, SymCryptFdef369RawAddAsmEnd
|
||||
|
||||
LABEL(SymCryptFdef369RawAddAsmLoop)
|
||||
// carry is in the carry flag
|
||||
// only update pointers to srcs and destination once per loop to reduce uops and dependencies
|
||||
ldp X_4, X_6, [X_0, #24]! // Load two words of pSrc1
|
||||
ldp X_5, X_7, [X_1, #24]! // Load two words of pSrc2
|
||||
adcs X_4, X_4, X_5
|
||||
adcs X_6, X_6, X_7
|
||||
stp X_4, X_6, [X_2, #24]! // Store the result in the destination
|
||||
|
||||
ldr X_4, [X_0, #16] // Load one word of pSrc1
|
||||
sub X_3, X_3, #1 // Decrement the digit count by one
|
||||
ldr X_5, [X_1, #16] // Load one word of pSrc2
|
||||
adcs X_4, X_4, X_5
|
||||
str X_4, [X_2, #16] // Store the result in the destination
|
||||
|
||||
cbnz X_3, SymCryptFdef369RawAddAsmLoop
|
||||
|
||||
ALIGN(4)
|
||||
LABEL(SymCryptFdef369RawAddAsmEnd)
|
||||
cset X_0, cs // Set the return value equal to the carry
|
||||
|
||||
FUNCTION_END(ARM64EC_NAME_MANGLE(SymCryptFdef369RawAddAsm))
|
||||
|
||||
//UINT32
|
||||
//SYMCRYPT_CALL
|
||||
//SymCryptFdef369RawSubAsm(
|
||||
// _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc1,
|
||||
// _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc2,
|
||||
// _Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 pDst,
|
||||
// UINT32 nDigits )
|
||||
|
||||
FUNCTION_START(ARM64EC_NAME_MANGLE(SymCryptFdef369RawSubAsm), 4, 8)
|
||||
|
||||
ldp X_4, X_6, [X_0] // Load two words of pSrc1
|
||||
ldp X_5, X_7, [X_1] // Load two words of pSrc2
|
||||
subs X_4, X_4, X_5
|
||||
sbcs X_6, X_6, X_7
|
||||
stp X_4, X_6, [X_2] // Store the result in the destination
|
||||
|
||||
ldr X_4, [X_0, #16] // Load one word of pSrc1
|
||||
sub X_3, X_3, #1 // Decrement the digit count by one
|
||||
ldr X_5, [X_1, #16] // Load one word of pSrc2
|
||||
sbcs X_4, X_4, X_5
|
||||
str X_4, [X_2, #16] // Store the result in the destination
|
||||
|
||||
cbz X_3, SymCryptFdef369RawSubAsmEnd
|
||||
|
||||
LABEL(SymCryptFdef369RawSubAsmLoop)
|
||||
// borrow is in the carry flag (flipped)
|
||||
// only update pointers to srcs and destination once per loop to reduce uops and dependencies
|
||||
ldp X_4, X_6, [X_0, #24]! // Load two words of pSrc1
|
||||
ldp X_5, X_7, [X_1, #24]! // Load two words of pSrc2
|
||||
sbcs X_4, X_4, X_5
|
||||
sbcs X_6, X_6, X_7
|
||||
stp X_4, X_6, [X_2, #24]! // Store the result in the destination
|
||||
|
||||
ldr X_4, [X_0, #16] // Load one word of pSrc1
|
||||
sub X_3, X_3, #1 // Decrement the digit count by one
|
||||
ldr X_5, [X_1, #16] // Load one word of pSrc2
|
||||
sbcs X_4, X_4, X_5
|
||||
str X_4, [X_2, #16] // Store the result in the destination
|
||||
|
||||
cbnz X_3, SymCryptFdef369RawSubAsmLoop
|
||||
|
||||
ALIGN(4)
|
||||
LABEL(SymCryptFdef369RawSubAsmEnd)
|
||||
cset X_0, cc // If the carry is clear (borrow), set the return value to 1
|
||||
|
||||
FUNCTION_END(ARM64EC_NAME_MANGLE(SymCryptFdef369RawSubAsm))
|
||||
|
||||
//VOID
|
||||
//SYMCRYPT_CALL
|
||||
//SymCryptFdef369MaskedCopyAsm(
|
||||
// _In_reads_bytes_( nDigits*SYMCRYPT_FDEF_DIGIT_SIZE ) PCBYTE pbSrc,
|
||||
// _Inout_updates_bytes_( nDigits*SYMCRYPT_FDEF_DIGIT_SIZE ) PBYTE pbDst,
|
||||
// UINT32 nDigits,
|
||||
// UINT32 mask )
|
||||
|
||||
FUNCTION_START(ARM64EC_NAME_MANGLE(SymCryptFdef369MaskedCopyAsm), 4, 7)
|
||||
|
||||
subs xzr, xzr, X_3 // If (X_3 > 0) clear the carry flag (i.e. borrow)
|
||||
|
||||
ldp X_3, X_5, [X_0] // Load two words of the source
|
||||
ldp X_4, X_6, [X_1] // Load two words of the destination
|
||||
csel X_3, X_3, X_4, cc // If the carry is clear, select the source operand
|
||||
csel X_5, X_5, X_6, cc
|
||||
stp X_3, X_5, [X_1] // Store the two words in the destination
|
||||
|
||||
ldr X_3, [X_0, #16] // Load one word of the source
|
||||
sub X_2, X_2, #1 // Decrement the digit count by one
|
||||
ldr X_4, [X_1, #16] // Load one word of the destination
|
||||
csel X_3, X_3, X_4, cc
|
||||
str X_3, [X_1, #16] // Store the one word in the destination
|
||||
|
||||
cbz X_2, SymCryptFdef369MaskedCopyAsmEnd
|
||||
|
||||
LABEL(SymCryptFdef369MaskedCopyAsmLoop)
|
||||
ldp X_3, X_5, [X_0, #24]! // Load two words of the source
|
||||
ldp X_4, X_6, [X_1, #24]! // Load two words of the destination
|
||||
csel X_3, X_3, X_4, cc // If the carry is clear, select the source operand
|
||||
csel X_5, X_5, X_6, cc
|
||||
stp X_3, X_5, [X_1] // Store the two words in the destination
|
||||
|
||||
ldr X_3, [X_0, #16] // Load one word of the source
|
||||
sub X_2, X_2, #1 // Decrement the digit count by one
|
||||
ldr X_4, [X_1, #16] // Load one word of the destination
|
||||
csel X_3, X_3, X_4, cc
|
||||
str X_3, [X_1, #16] // Store the one word in the destination
|
||||
|
||||
cbnz X_2, SymCryptFdef369MaskedCopyAsmLoop
|
||||
|
||||
LABEL(SymCryptFdef369MaskedCopyAsmEnd)
|
||||
// Done, no return value
|
||||
|
||||
FUNCTION_END(ARM64EC_NAME_MANGLE(SymCryptFdef369MaskedCopyAsm))
|
||||
|
||||
//VOID
|
||||
//SYMCRYPT_CALL
|
||||
//SymCryptFdef369RawMulAsm(
|
||||
// _In_reads_(nDigits1*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc1,
|
||||
// UINT32 nDigits1,
|
||||
// _In_reads_(nDigits2*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc2,
|
||||
// UINT32 nDigits2,
|
||||
// _Out_writes_((nDigits1+nDigits2)*SYMCRYPT_FDEF_DIGIT_NUINT32) PUINT32 pDst )
|
||||
//
|
||||
// Basic structure:
|
||||
// for each word in Src1:
|
||||
// Dst += Src2 * word
|
||||
//
|
||||
// Register assignments
|
||||
// X_0 = pSrc1 (moving forward one word every outer loop)
|
||||
// X_1 = word count of pSrc1
|
||||
// X_2 = pSrc2 (moving forward one *digit* every inner loop)
|
||||
// X_3 = digit count of pSrc2 and pDst
|
||||
// X_4 = pDst (moving forward one *digit* every inner loop)
|
||||
// X_5 = Stored pDst (moving forward one word every outer loop)
|
||||
// X_6 = Current word loaded from pSrc1
|
||||
// X_7, X_8 = Current words loaded in pairs from pSrc2
|
||||
// X_9, X_10 = Current words loaded in pairs from pDst
|
||||
// X_11, X_12 = Scratch registers for holding the results of multiplies
|
||||
// X_13 = Stored pSrc2
|
||||
// X_14 = Stored digit count of pSrc2
|
||||
// X_15 = Scratch register for holding the results of multiplies
|
||||
|
||||
FUNCTION_START(ARM64EC_NAME_MANGLE(SymCryptFdef369RawMulAsm), 5, 16)
|
||||
|
||||
add X_1, X_1, X_1, LSL #1 // Calculate word count (X_1 * 3)
|
||||
|
||||
sub X_2, X_2, #24 // offset pSrc2 so we can use pre-increment form of loads
|
||||
sub X_4, X_4, #24 // offset pDst so we can use pre-increment form of loads
|
||||
|
||||
mov X_5, X_4 // store pDst
|
||||
mov X_13, X_2 // store pSrc2
|
||||
mov X_14, X_3 // store nDigits2 for later
|
||||
|
||||
//
|
||||
// First iteration of main loop (no adding of previous values from pDst)
|
||||
//
|
||||
ands X_12, X_12, xzr // Clearing the carry flag and setting X_12 = 0
|
||||
ldr X_6, [X_0] // load the first word from pSrc1
|
||||
|
||||
LABEL(SymCryptFdef369RawMulAsmLoopInner1)
|
||||
sub X_3, X_3, #1 // move one digit up
|
||||
|
||||
ldp X_7, X_8, [X_2, #24]! // load 2 words from pSrc2
|
||||
|
||||
mul X_11, X_6, X_7 // Bits <63:0> of pSrc1[0]*pSrc2[j]
|
||||
adcs X_11, X_11, X_12 // Adding the previous word (if there was a carry from the last addition it is added)
|
||||
umulh X_12, X_6, X_7 // Bits <127:64> of pSrc1[0]*pSrc2[j]
|
||||
|
||||
mul X_15, X_6, X_8 // Bits <63:0> of pSrc1[0]*pSrc2[j+1]
|
||||
adcs X_15, X_15, X_12 // Adding the previous word (if there was a carry from the last addition it is added)
|
||||
umulh X_12, X_6, X_8 // Bits <127:64> of pSrc1[0]*pSrc2[j+1]
|
||||
|
||||
stp X_11, X_15, [X_4, #24]! // Store to destination
|
||||
ldr X_7, [X_2, #16] // load 1 word from pSrc2
|
||||
|
||||
mul X_11, X_6, X_7 // Bits <63:0> of pSrc1[0]*pSrc2[j+2]
|
||||
adcs X_11, X_11, X_12 // Adding the previous word (if there was a carry from the last addition it is added)
|
||||
umulh X_12, X_6, X_7 // Bits <127:64> of pSrc1[0]*pSrc2[j+2]
|
||||
|
||||
str X_11, [X_4, #16] // Store to destination
|
||||
|
||||
cbnz X_3, SymCryptFdef369RawMulAsmLoopInner1
|
||||
|
||||
adc X_12, X_12, xzr // Store the next word into the destination (with the carry if any)
|
||||
str X_12, [X_4, #24]
|
||||
|
||||
sub X_1, X_1, #1 // move one word up
|
||||
add X_0, X_0, #8 // move start of pSrc1 one word up
|
||||
add X_5, X_5, #8 // move start of pDst one word up
|
||||
|
||||
//
|
||||
// MAIN LOOP
|
||||
//
|
||||
LABEL(SymCryptFdef369RawMulAsmLoopOuter)
|
||||
mov X_3, X_14 // set nDigits2
|
||||
mov X_2, X_13 // set pSrc2
|
||||
mov X_4, X_5 // set pDst
|
||||
|
||||
ands X_12, X_12, xzr // Clearing the carry flag and setting X_12 = 0
|
||||
ldr X_6, [X_0] // load the next word from pSrc1
|
||||
|
||||
LABEL(SymCryptFdef369RawMulAsmLoopInner)
|
||||
sub X_3, X_3, #1 // move one digit up
|
||||
|
||||
ldp X_7, X_8, [X_2, #24]! // load 2 words from pSrc2
|
||||
ldp X_9, X_10, [X_4, #24]! // load 2 words from pDst
|
||||
|
||||
adcs X_9, X_9, X_12 // Adding the previous word (if there was a carry from the last addition it is added)
|
||||
umulh X_11, X_6, X_7 // Bits <127:64> of pSrc1[i]*pSrc2[j]
|
||||
adcs X_10, X_11, X_10 // Adding the previous word (if there was a carry from the last addition it is added)
|
||||
umulh X_12, X_6, X_8 // Bits <127:64> of pSrc1[i]*pSrc2[j+1]
|
||||
adc X_12, X_12, xzr // Add the carry if any and don't update the flags
|
||||
|
||||
mul X_11, X_6, X_7 // Bits <63:0> of pSrc1[i]*pSrc2[j]
|
||||
adds X_9, X_9, X_11 // add the word from the destination and update the flags (this can overflow)
|
||||
mul X_11, X_6, X_8 // Bits <63:0> of pSrc1[i]*pSrc2[j+1]
|
||||
adcs X_10, X_10, X_11 // add the word from the destination and update the flags (this can overflow)
|
||||
|
||||
stp X_9, X_10, [X_4] // Store to destination
|
||||
|
||||
ldr X_7, [X_2, #16] // load 1 word from pSrc2
|
||||
ldr X_9, [X_4, #16] // load 1 word from pDst
|
||||
|
||||
adcs X_9, X_9, X_12 // Adding the previous word (if there was a carry from the last addition it is added)
|
||||
umulh X_12, X_6, X_7 // Bits <127:64> of pSrc1[i]*pSrc2[j+2]
|
||||
adc X_12, X_12, xzr // Add the carry if any and don't update the flags
|
||||
|
||||
mul X_11, X_6, X_7 // Bits <63:0> of pSrc1[i]*pSrc2[j+2]
|
||||
adds X_9, X_9, X_11 // add the word from the destination and update the flags (this can overflow)
|
||||
|
||||
str X_9, [X_4, #16] // Store to destination
|
||||
|
||||
cbnz X_3, SymCryptFdef369RawMulAsmLoopInner
|
||||
|
||||
adc X_12, X_12, xzr // Store the next word into the destination (with the carry if any)
|
||||
str X_12, [X_4, #24]
|
||||
|
||||
subs X_1, X_1, #1 // move one word up
|
||||
add X_0, X_0, #8 // move start of pSrc1 one word up
|
||||
add X_5, X_5, #8 // move start of pDst one word up
|
||||
|
||||
bne SymCryptFdef369RawMulAsmLoopOuter
|
||||
|
||||
// Done, no return value
|
||||
|
||||
FUNCTION_END(ARM64EC_NAME_MANGLE(SymCryptFdef369RawMulAsm))
|
||||
|
||||
|
||||
//VOID
|
||||
//SYMCRYPT_CALL
|
||||
//SymCryptFdef369MontgomeryReduceAsm(
|
||||
// _In_ PCSYMCRYPT_MODULUS pmMod,
|
||||
// _Inout_ PUINT32 pSrc,
|
||||
// _Out_ PUINT32 pDst )
|
||||
//
|
||||
// Register assignments
|
||||
// X_0 = pMod (moving forward one *digit* every inner loop)
|
||||
// X_1 = pSrc (moving forward one *digit* every inner loop)
|
||||
// X_2 = pDst (used only in the end for subtract / result)
|
||||
// X_3 = digit count of pSrc and pMod
|
||||
// X_4 = word count of pSrc
|
||||
// X_5 = Inv64 of the modulus
|
||||
// X_6 = m = pSrc[i]*Inv64
|
||||
// X_7 = hc = high carry variable
|
||||
// X_8, X_9 = Current words loaded in pairs from pSrc
|
||||
// X_10, X_11 = Current words loaded in pairs from pMod
|
||||
// X_12, X_13 = c variable = "128-bit" register to hold the result of multiplies
|
||||
// It is flipped between [X_12:X_13] and [X_13:X_12] instead of doing c>>=64
|
||||
// X_14 = Temporary intermediate result
|
||||
// X_15 = Stored digit count of pSrc
|
||||
// X_16 = Stored pMod pointer
|
||||
// X_17 = Stored pSrc pointer (moving forward one word every outer loop)
|
||||
|
||||
FUNCTION_START(ARM64EC_NAME_MANGLE(SymCryptFdef369MontgomeryReduceAsm), 3, 18)
|
||||
|
||||
ldr W_3, [X_0, #SymCryptModulusNdigitsOffsetArm64] // # of Digits
|
||||
ldr X_5, [X_0, #SymCryptModulusMontgomeryInv64OffsetArm64] // Inv64 of modulus
|
||||
add X_0, X_0, #SymCryptModulusValueOffsetArm64 // pMod
|
||||
|
||||
add X_4, X_3, X_3, LSL #1 // Calculate word count (X_3 * 3)
|
||||
|
||||
sub X_0, X_0, #24 // offset pMod so we can use pre-increment form of loads
|
||||
sub X_1, X_1, #24 // offset pSrc so we can use pre-increment form of loads
|
||||
sub X_2, X_2, #24 // offset pDst so we can use pre-increment form of loads
|
||||
|
||||
mov X_15, X_3 // Store the digit count for later
|
||||
mov X_16, X_0 // Store the pMod pointer
|
||||
mov X_17, X_1 // Store the pSrc pointer
|
||||
|
||||
and X_7, X_7, xzr // Set hc to 0
|
||||
|
||||
//
|
||||
// Main loop
|
||||
//
|
||||
LABEL(SymCryptFdef369MontgomeryReduceAsmOuter)
|
||||
ldr X_8, [X_1, #24] // Load 1 word from pSrc
|
||||
mul X_6, X_8, X_5 // <63:0> bits of pSrc[i]*Inv64 = m
|
||||
|
||||
and X_12, X_12, xzr // Set c to 0
|
||||
|
||||
LABEL(SymCryptFdef369MontgomeryReduceAsmInner)
|
||||
ldp X_10, X_11, [X_0, #24]! // pMod[j]
|
||||
ldp X_8, X_9, [X_1, #24]! // pSrc[j]
|
||||
|
||||
mul X_14, X_6, X_10 // <63:0> of pMod[j]*m
|
||||
adds X_14, X_14, X_8 // Adding pSrc[j]
|
||||
umulh X_13, X_6, X_10 // <127:64> of pMod[j]*m
|
||||
adc X_13, X_13, xzr // Add the carry if any (***)
|
||||
adds X_12, X_12, X_14 // Add the lower bits of c
|
||||
adc X_13, X_13, xzr // Add the carry if any (***)
|
||||
// ***: These cannot produce extra carry as the maximum is
|
||||
// (2^64 - 1)*(2^64-1) + 2^64-1 + 2^64-1 = 2^128 - 1
|
||||
str X_12, [X_1] // pSrc[j] = (UINT64) c
|
||||
mov X_12, X_13 // c >>= 64
|
||||
|
||||
mul X_14, X_6, X_11 // <63:0> of pMod[j]*m
|
||||
adds X_14, X_14, X_9 // Adding pSrc[j]
|
||||
umulh X_13, X_6, X_11 // <127:64> of pMod[j]*m
|
||||
adc X_13, X_13, xzr // Add the carry if any (***)
|
||||
adds X_12, X_12, X_14 // Add the lower bits of c
|
||||
adc X_13, X_13, xzr // Add the carry if any (***)
|
||||
str X_12, [X_1, #8] // pSrc[j] = (UINT64) c
|
||||
mov X_12, X_13 // c >>= 64
|
||||
|
||||
ldr X_10, [X_0, #16] // pMod[j]
|
||||
ldr X_8, [X_1, #16] // pSrc[j]
|
||||
|
||||
mul X_14, X_6, X_10 // <63:0> of pMod[j]*m
|
||||
adds X_14, X_14, X_8 // Adding pSrc[j]
|
||||
umulh X_13, X_6, X_10 // <127:64> of pMod[j]*m
|
||||
adc X_13, X_13, xzr // Add the carry if any (***)
|
||||
adds X_12, X_12, X_14 // Add the lower bits of c
|
||||
adc X_13, X_13, xzr // Add the carry if any (***)
|
||||
str X_12, [X_1, #16] // pSrc[j] = (UINT64) c
|
||||
mov X_12, X_13 // c >>= 64
|
||||
|
||||
subs X_3, X_3, #1 // Move one digit up
|
||||
bne SymCryptFdef369MontgomeryReduceAsmInner
|
||||
|
||||
ldr X_8, [X_1, #24] // pSrc[nWords]
|
||||
adds X_12, X_12, X_8 // c + pSrc[nWords]
|
||||
adc X_13, xzr, xzr // Add the carry if any
|
||||
|
||||
adds X_12, X_12, X_7 // c + pSrc[nWords] + hc
|
||||
adc X_7, X_13, xzr // Add the carry if any and store into hc
|
||||
|
||||
str X_12, [X_1, #24] // pSrc[nWords] = c
|
||||
|
||||
subs X_4, X_4, #1 // Move one word up
|
||||
|
||||
add X_17, X_17, #8 // Move stored pSrc pointer one word up
|
||||
mov X_0, X_16 // Restore pMod pointer
|
||||
mov X_1, X_17 // Restore pSrc pointer
|
||||
|
||||
mov X_3, X_15 // Restore the digit counter
|
||||
|
||||
bne SymCryptFdef369MontgomeryReduceAsmOuter
|
||||
|
||||
//
|
||||
// Subtraction
|
||||
//
|
||||
|
||||
mov X_14, X_2 // Store pDst pointer
|
||||
|
||||
// Prepare the pointers for subtract
|
||||
mov X_0, X_17 // pSrc
|
||||
mov X_1, X_16 // pMod
|
||||
|
||||
mov X_10, X_7 // X_10 = hc
|
||||
mov X_3, X_15 // Restore the digit counter
|
||||
subs X_4, X_4, X_4 // Set the carry flag (i.e. no borrow)
|
||||
|
||||
LABEL(SymCryptFdef369MontgomeryReduceRawSubAsmLoop)
|
||||
sub X_3, X_3, #1 // Decrement the digit count by one
|
||||
// borrow is in the carry flag (flipped)
|
||||
|
||||
ldp X_4, X_6, [X_0, #24]! // Load two words of pSrc1
|
||||
ldp X_5, X_7, [X_1, #24]! // Load two words of pSrc2
|
||||
sbcs X_4, X_4, X_5
|
||||
sbcs X_6, X_6, X_7
|
||||
stp X_4, X_6, [X_2, #24]! // Store the result in the destination
|
||||
|
||||
ldr X_4, [X_0, #16] // Load one word of pSrc1
|
||||
ldr X_5, [X_1, #16] // Load one word of pSrc2
|
||||
sbcs X_4, X_4, X_5
|
||||
str X_4, [X_2, #16] // Store the result in the destination
|
||||
|
||||
cbnz X_3, SymCryptFdef369MontgomeryReduceRawSubAsmLoop
|
||||
|
||||
cset X_0, cc // If the carry is clear (borrow), set the return value to 1
|
||||
|
||||
orr X_11, X_10, X_0 // X_11 = hc|d
|
||||
|
||||
// Prepare the pointers for masked copy
|
||||
mov X_0, X_17 // pSrc
|
||||
mov X_1, X_14 // pDst
|
||||
|
||||
mov X_2, X_15 // Restore the digit counter
|
||||
subs X_4, X_10, X_11 // If (X_11 > X_10) clear the carry flag (i.e. borrow)
|
||||
|
||||
LABEL(SymCryptFdef369MontgomeryReduceMaskedCopyAsmLoop)
|
||||
sub X_2, X_2, #1 // decrement the digit count by one
|
||||
|
||||
ldp X_4, X_6, [X_0, #24]! // Load two words of the source
|
||||
ldp X_5, X_7, [X_1, #24]! // Load two words of the destination
|
||||
csel X_4, X_4, X_5, cc // If the carry is clear, select the source operands
|
||||
csel X_6, X_6, X_7, cc
|
||||
stp X_4, X_6, [X_1] // Store the two words in the destination
|
||||
|
||||
ldr X_4, [X_0, #16] // Load one word of the source
|
||||
ldr X_5, [X_1, #16] // Load one word of the destination
|
||||
csel X_4, X_4, X_5, cc // If the carry is clear, select the source operands
|
||||
str X_4, [X_1, #16] // Store the one word in the destination
|
||||
|
||||
cbnz X_2, SymCryptFdef369MontgomeryReduceMaskedCopyAsmLoop
|
||||
|
||||
// Done, no return value
|
||||
|
||||
FUNCTION_END(ARM64EC_NAME_MANGLE(SymCryptFdef369MontgomeryReduceAsm))
|
||||
|
||||
FILE_END()
|
|
@ -1,768 +0,0 @@
|
|||
;
|
||||
; fdef_asm.asm Assembler code for large integer arithmetic in the default data format for the arm64 architecture
|
||||
;
|
||||
; Copyright (c) Microsoft Corporation. Licensed under the MIT license.
|
||||
;
|
||||
|
||||
#include "ksarm64.h"
|
||||
|
||||
; As Arm assembler already uses C preprocessor, we can just hardcode this asm to include constants
|
||||
; MASM for now. To be fixed properly when converting arm64 asm to symcryptasm.
|
||||
#define SYMCRYPT_MASM
|
||||
#include "C_asm_shared.inc"
|
||||
#undef SYMCRYPT_MASM
|
||||
|
||||
#include "symcrypt_version.inc"
|
||||
#include "symcrypt_name_mangling.inc"
|
||||
#include "symcrypt_magic.inc"
|
||||
|
||||
; A digit consists of 4 words of 64 bits each
|
||||
|
||||
;UINT32
|
||||
;SYMCRYPT_CALL
|
||||
; SymCryptFdefRawAdd(
|
||||
; _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc1,
|
||||
; _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc2,
|
||||
; _Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 pDst,
|
||||
; UINT32 nDigits );
|
||||
;
|
||||
; Initial inputs to registers:
|
||||
; pSrc1 -> x0
|
||||
; pSrc2 -> x1
|
||||
; pDst -> x2
|
||||
; nDigits -> x3
|
||||
|
||||
LEAF_ENTRY ARM64EC_NAME_MANGLE(SymCryptFdefRawAddAsm)
|
||||
|
||||
neg x3, x3 ; negate the digit count
|
||||
ands x4, x4, x4 ; Zero the carry flag
|
||||
|
||||
SymCryptFdefRawAddAsmLoop
|
||||
add x3, x3, #1 ; Increment the digit count by one
|
||||
; carry is in the carry flag
|
||||
|
||||
ldp x4, x6, [x0], #16 ; Load two words of pSrc1
|
||||
ldp x5, x7, [x1], #16 ; Load two words of pSrc2
|
||||
adcs x4, x4, x5
|
||||
adcs x6, x6, x7
|
||||
stp x4, x6, [x2], #16 ; Store the result in the destination
|
||||
|
||||
ldp x4, x6, [x0], #16 ; Load two words of pSrc1
|
||||
ldp x5, x7, [x1], #16 ; Load two words of pSrc2
|
||||
adcs x4, x4, x5
|
||||
adcs x6, x6, x7
|
||||
stp x4, x6, [x2], #16 ; Store the result in the destination
|
||||
|
||||
cbnz x3, SymCryptFdefRawAddAsmLoop
|
||||
|
||||
csetcs x0 ; Set the return value equal to the carry
|
||||
|
||||
ret
|
||||
|
||||
LEAF_END ARM64EC_NAME_MANGLE(SymCryptFdefRawAddAsm)
|
||||
|
||||
;UINT32
|
||||
;SYMCRYPT_CALL
|
||||
;SymCryptFdefRawSub(
|
||||
; _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 Src1,
|
||||
; _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 Src2,
|
||||
; _Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 Dst,
|
||||
; UINT32 nDigits )
|
||||
;
|
||||
; Initial inputs to registers:
|
||||
; pSrc1 -> x0
|
||||
; pSrc2 -> x1
|
||||
; pDst -> x2
|
||||
; nDigits -> x3
|
||||
|
||||
LEAF_ENTRY ARM64EC_NAME_MANGLE(SymCryptFdefRawSubAsm)
|
||||
|
||||
neg x3, x3 ; negate the digit count
|
||||
subs x4, x4, x4 ; Set the carry flag (i.e. no borrow)
|
||||
|
||||
SymCryptFdefRawSubAsmLoop
|
||||
add x3, x3, #1 ; Increment the digit count by one
|
||||
; borrow is in the carry flag (flipped)
|
||||
|
||||
ldp x4, x6, [x0], #16 ; Load two words of pSrc1
|
||||
ldp x5, x7, [x1], #16 ; Load two words of pSrc2
|
||||
sbcs x4, x4, x5
|
||||
sbcs x6, x6, x7
|
||||
stp x4, x6, [x2], #16 ; Store the result in the destination
|
||||
|
||||
ldp x4, x6, [x0], #16 ; Load two words of pSrc1
|
||||
ldp x5, x7, [x1], #16 ; Load two words of pSrc2
|
||||
sbcs x4, x4, x5
|
||||
sbcs x6, x6, x7
|
||||
stp x4, x6, [x2], #16 ; Store the result in the destination
|
||||
|
||||
cbnz x3, SymCryptFdefRawSubAsmLoop
|
||||
|
||||
csetcc x0 ; If the carry is clear (borrow), set the return value to 1
|
||||
|
||||
ret
|
||||
|
||||
LEAF_END ARM64EC_NAME_MANGLE(SymCryptFdefRawSubAsm)
|
||||
|
||||
;VOID
|
||||
;SYMCRYPT_CALL
|
||||
;SymCryptFdefMaskedCopy(
|
||||
; _In_reads_bytes_( nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCBYTE pbSrc,
|
||||
; _InOut_writes_bytes_( nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PBYTE pbDst,
|
||||
; UINT32 nDigits,
|
||||
; UINT32 mask )
|
||||
|
||||
LEAF_ENTRY ARM64EC_NAME_MANGLE(SymCryptFdefMaskedCopyAsm)
|
||||
|
||||
neg x2, x2 ; negate the digit count
|
||||
subs x4, XZR, x3 ; If (x3 > 0) clear the carry flag (i.e. borrow)
|
||||
|
||||
SymCryptFdefMaskedCopyAsmLoop
|
||||
add x2, x2, #1 ; Increment the digit count by one
|
||||
|
||||
ldp x4, x6, [x0], #16 ; Load two words of the source
|
||||
ldp x5, x7, [x1] ; Load two words of the destination
|
||||
cselcc x4, x4, x5 ; If the carry is clear, select the source operands
|
||||
cselcc x6, x6, x7
|
||||
stp x4, x6, [x1], #16 ; Store the two words in the destination
|
||||
|
||||
ldp x4, x6, [x0], #16
|
||||
ldp x5, x7, [x1]
|
||||
cselcc x4, x4, x5
|
||||
cselcc x6, x6, x7
|
||||
stp x4, x6, [x1], #16
|
||||
|
||||
cbnz x2, SymCryptFdefMaskedCopyAsmLoop
|
||||
|
||||
; Done, no return value
|
||||
|
||||
ret
|
||||
|
||||
LEAF_END ARM64EC_NAME_MANGLE(SymCryptFdefMaskedCopyAsm)
|
||||
|
||||
;VOID
|
||||
;SYMCRYPT_CALL
|
||||
;SymCryptFdefRawMul(
|
||||
; _In_reads_(nWords1) PCUINT32 pSrc1,
|
||||
; UINT32 nDigits1,
|
||||
; _In_reads_(nWords2) PCUINT32 pSrc2,
|
||||
; UINT32 nDigits2,
|
||||
; _Out_writes_(nWords1 + nWords2) PUINT32 pDst )
|
||||
;
|
||||
; Initial inputs to registers:
|
||||
; pSrc1 -> x0
|
||||
; nDigits1 -> x1
|
||||
; pSrc2 -> x2
|
||||
; nDigits2 -> x3
|
||||
; pDst -> x4
|
||||
;
|
||||
; Basic structure:
|
||||
; for each word in Src1:
|
||||
; Dst += Src2 * word
|
||||
;
|
||||
; Register assignments
|
||||
; x0 = pSrc1 (moving forward one word every outer loop)
|
||||
; x1 = negated word count of pSrc1
|
||||
; x2 = pSrc2 (moving forward one *digit* every inner loop)
|
||||
; x3 = negated digit count of pSrc2 and pDst
|
||||
; x4 = pDst (moving forward one *digit* every inner loop)
|
||||
; x5 = Stored pDst (moving forward one word every outer loop)
|
||||
; x6 = Current word loaded from pSrc1
|
||||
; x8, x9 = Current words loaded in pairs from pSrc2
|
||||
; x10, x11 = Current words loaded in pairs from pDst
|
||||
; x12, x15 = "128-bit" sliding register to hold the result of multiplies
|
||||
; x16 = Stored pSrc2
|
||||
; x17 = Stored negated digit count of pSrc2
|
||||
; Note x13, x14 are reserved in ARM64EC and thus are not used
|
||||
|
||||
|
||||
LEAF_ENTRY ARM64EC_NAME_MANGLE(SymCryptFdefRawMulAsm)
|
||||
|
||||
lsl x1, x1, #2 ; Calculate word count
|
||||
|
||||
neg x1, x1 ; negate nWords1
|
||||
neg x3, x3 ; negate nDigits2
|
||||
|
||||
mov x5, x4 ; store pDst
|
||||
mov x16, x2 ; store pSrc2
|
||||
mov x17, x3 ; store -nDigits2 for later
|
||||
|
||||
;
|
||||
; First iteration of main loop (no adding of previous values from pDst)
|
||||
;
|
||||
ands x15, x15, XZR ; Clearing the carry flag and setting x15 = 0
|
||||
ldr x6, [x0] ; load the first word from pSrc1
|
||||
|
||||
SymCryptFdefRawMulAsmLoopInner1
|
||||
add x3, x3, #1 ; move one digit up
|
||||
|
||||
ldp x8, x9, [x2] ; load 2 words from pSrc2
|
||||
|
||||
mul x12, x6, x8 ; Bits <63:0> of pSrc1[0]*pSrc2[j]
|
||||
adcs x12, x12, x15 ; Adding the previous word (if there was a carry from the last addition it is added)
|
||||
umulh x15, x6, x8 ; Bits <127:64> of pSrc1[0]*pSrc2[j]
|
||||
str x12, [x4] ; Store to destination
|
||||
|
||||
mul x12, x6, x9 ; Bits <63:0> of pSrc1[0]*pSrc2[j+1]
|
||||
adcs x12, x12, x15 ; Adding the previous word (if there was a carry from the last addition it is added)
|
||||
umulh x15, x6, x9 ; Bits <127:64> of pSrc1[0]*pSrc2[j+1]
|
||||
str x12, [x4, #8] ; Store to destination
|
||||
|
||||
ldp x8, x9, [x2, #16] ; load 2 words from pSrc2
|
||||
|
||||
mul x12, x6, x8 ; Bits <63:0> of pSrc1[0]*pSrc2[j+2]
|
||||
adcs x12, x12, x15 ; Adding the previous word (if there was a carry from the last addition it is added)
|
||||
umulh x15, x6, x8 ; Bits <127:64> of pSrc1[0]*pSrc2[j+2]
|
||||
str x12, [x4, #16] ; Store to destination
|
||||
|
||||
mul x12, x6, x9 ; Bits <63:0> of pSrc1[0]*pSrc2[j+3]
|
||||
adcs x12, x12, x15 ; Adding the previous word (if there was a carry from the last addition it is added)
|
||||
umulh x15, x6, x9 ; Bits <127:64> of pSrc1[0]*pSrc2[j+3]
|
||||
str x12, [x4, #24] ; Store to destination
|
||||
|
||||
add x2, x2, #32
|
||||
add x4, x4, #32
|
||||
|
||||
cbnz x3, SymCryptFdefRawMulAsmLoopInner1
|
||||
|
||||
adc x15, x15, XZR ; Store the next word into the destination (with the carry if any)
|
||||
str x15, [x4]
|
||||
|
||||
add x1, x1, #1 ; move one word up
|
||||
add x0, x0, #8 ; move start of pSrc1 one word up
|
||||
add x5, x5, #8 ; move start of pDst one word up
|
||||
|
||||
;
|
||||
; MAIN LOOP
|
||||
;
|
||||
SymCryptFdefRawMulAsmLoopOuter
|
||||
mov x3, x17 ; set -nDigits2
|
||||
mov x2, x16 ; set pSrc2
|
||||
mov x4, x5 ; set pDst
|
||||
|
||||
ands x15, x15, XZR ; Clearing the carry flag and setting x15 = 0
|
||||
ldr x6, [x0] ; load the next word from pSrc1
|
||||
|
||||
SymCryptFdefRawMulAsmLoopInner
|
||||
add x3, x3, #1 ; move one digit up
|
||||
|
||||
ldp x8, x9, [x2] ; load 2 words from pSrc2
|
||||
ldp x10, x11, [x4] ; load 2 words from pDst
|
||||
|
||||
mul x12, x6, x8 ; Bits <63:0> of pSrc1[i]*pSrc2[j]
|
||||
adcs x12, x12, x15 ; Adding the previous word (if there was a carry from the last addition it is added)
|
||||
umulh x15, x6, x8 ; Bits <127:64> of pSrc1[i]*pSrc2[j]
|
||||
adc x15, x15, XZR ; Add the carry if any and don't update the flags
|
||||
; Note: this cannot overflow as the maximum for <x15:x12> is (2^64-1)(2^64-1)+(2^64-1)+1 = 2^128 - 2^64 + 1
|
||||
adds x12, x12, x10 ; add the word from the destination and update the flags (this can overflow)
|
||||
str x12, [x4] ; Store to destination
|
||||
|
||||
mul x12, x6, x9 ; Bits <63:0> of pSrc1[i]*pSrc2[j+1]
|
||||
adcs x12, x12, x15 ; Adding the previous word (if there was a carry from the last addition it is added)
|
||||
umulh x15, x6, x9 ; Bits <127:64> of pSrc1[i]*pSrc2[j+1]
|
||||
adc x15, x15, XZR ; Add the carry if any and don't update the flags
|
||||
adds x12, x12, x11 ; add the word from the destination and update the flags (this can overflow)
|
||||
str x12, [x4, #8] ; Store to destination
|
||||
|
||||
ldp x8, x9, [x2, #16] ; load 2 words from pSrc2
|
||||
ldp x10, x11, [x4, #16] ; load 2 words from pDst
|
||||
|
||||
mul x12, x6, x8 ; Bits <63:0> of pSrc1[i]*pSrc2[j+2]
|
||||
adcs x12, x12, x15 ; Adding the previous word (if there was a carry from the last addition it is added)
|
||||
umulh x15, x6, x8 ; Bits <127:64> of pSrc1[i]*pSrc2[j+2]
|
||||
adc x15, x15, XZR ; Add the carry if any and don't update the flags
|
||||
adds x12, x12, x10 ; add the word from the destination and update the flags (this can overflow)
|
||||
str x12, [x4, #16] ; Store to destination
|
||||
|
||||
mul x12, x6, x9 ; Bits <63:0> of pSrc1[i]*pSrc2[j+3]
|
||||
adcs x12, x12, x15 ; Adding the previous word (if there was a carry from the last addition it is added)
|
||||
umulh x15, x6, x9 ; Bits <127:64> of pSrc1[i]*pSrc2[j+3]
|
||||
adc x15, x15, XZR ; Add the carry if any and don't update the flags
|
||||
adds x12, x12, x11 ; add the word from the destination and update the flags (this can overflow)
|
||||
str x12, [x4, #24] ; Store to destination
|
||||
|
||||
add x2, x2, #32
|
||||
add x4, x4, #32
|
||||
|
||||
cbnz x3, SymCryptFdefRawMulAsmLoopInner
|
||||
|
||||
adc x15, x15, XZR ; Store the next word into the destination (with the carry if any)
|
||||
str x15, [x4]
|
||||
|
||||
adds x1, x1, #1 ; move one word up
|
||||
add x0, x0, #8 ; move start of pSrc1 one word up
|
||||
add x5, x5, #8 ; move start of pDst one word up
|
||||
|
||||
bne SymCryptFdefRawMulAsmLoopOuter
|
||||
|
||||
; Done, no return value
|
||||
|
||||
ret
|
||||
|
||||
LEAF_END ARM64EC_NAME_MANGLE(SymCryptFdefRawMulAsm)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
; Macro for the first loop of the first pass of RawSquareAsm.
|
||||
; It takes one word from the source, multiplies it with the mulword,
|
||||
; adds the high level word of the previous macro call, and stores it into
|
||||
; the destination.
|
||||
;
|
||||
; No carry flag is propagated from the previous macro call as the maximum is
|
||||
; (2^64-1)^2 + 2^64-1 = 2^128 - 2^64
|
||||
MACRO
|
||||
SQR_SINGLEADD_64 $index
|
||||
|
||||
ldr x8, [x2, #8*$index] ; pSrc[i+j]
|
||||
|
||||
mul x12, x6, x8 ; Bits <63:0> of pSrc[i]*pSrc[i+j]
|
||||
adds x12, x12, x15 ; Adding the previous word
|
||||
umulh x15, x6, x8 ; Bits <127:64> of pSrc[i]*pSrc[i+j]
|
||||
adc x15, x15, XZR ; Add the intermediate carry and don't update the flags
|
||||
|
||||
str x12, [x4, #8*$index] ; Store to destination
|
||||
|
||||
MEND
|
||||
|
||||
; Macro for the remaining loops of the first pass of RawSquareAsm.
|
||||
; The only difference to the above is that it also adds the word loaded
|
||||
; from the destination buffer.
|
||||
;
|
||||
; No carry flag is propagated from the previous macro call as the maximum is
|
||||
; (2^64-1)^2 + 2(2^64-1) = 2^128 - 1
|
||||
MACRO
|
||||
SQR_DOUBLEADD_64 $index
|
||||
|
||||
ldr x8, [x2, #8*$index] ; pSrc[i+j]
|
||||
ldr x10, [x4, #8*$index] ; pDst[2*(i+j)]
|
||||
|
||||
mul x12, x6, x8 ; Bits <63:0> of pSrc[i]*pSrc[i+j]
|
||||
adds x12, x12, x15 ; Adding the previous word
|
||||
umulh x15, x6, x8 ; Bits <127:64> of pSrc[i]*pSrc[i+j]
|
||||
adc x15, x15, XZR ; Add the intermediate carry and don't update the flags
|
||||
|
||||
adds x12, x12, x10 ; Add the word from the destination
|
||||
adc x15, x15, XZR ; Add the intermediate carry and don't update the flags
|
||||
|
||||
str x12, [x4, #8*$index] ; Store to destination
|
||||
|
||||
MEND
|
||||
|
||||
; Macro for the third pass loop of RawSquareAsm.
|
||||
; It takes one mulword from the source, squares it, and
|
||||
; adds it to the even columns of the destination. The carries are propagated
|
||||
; to the odd columns.
|
||||
;
|
||||
; Here we can have a (1-bit) carry to the next call because the maximum value for
|
||||
; a pair of columns is (2^64-1)^2+(2^128-1)+1 = 2^129 - 2^65 + 1 < 2^129 - 1
|
||||
MACRO
|
||||
SQR_DIAGONAL_PROP $index
|
||||
ldr x6, [x0, #8*$index] ; mulword
|
||||
mul x12, x6, x6 ; Bits <63:0> of m^2
|
||||
umulh x15, x6, x6 ; Bits <127:64> of m^2
|
||||
|
||||
ldp x8, x9, [x4, #16*$index] ; Load
|
||||
|
||||
; Adding the square to the even column
|
||||
adcs x12, x12, x8 ; carry from previous and update the flags
|
||||
|
||||
; Propagating the sum to the next column
|
||||
adcs x15, x15, x9 ; This can generate a carry
|
||||
|
||||
stp x12, x15, [x4, #16*$index] ; Store
|
||||
MEND
|
||||
|
||||
; VOID
|
||||
; SYMCRYPT_CALL
|
||||
; SymCryptFdefRawSquareAsm(
|
||||
; _In_reads_(nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc,
|
||||
; UINT32 nDigits,
|
||||
; _Out_writes_(2*nWords) PUINT32 pDst )
|
||||
;
|
||||
; Initial inputs to registers:
|
||||
; pSrc -> x0
|
||||
; nDigits -> x1
|
||||
; pDst -> x2
|
||||
;
|
||||
; Register assignments
|
||||
; x0 = pSrc
|
||||
; x1 = negated word count of pSrc
|
||||
; x2 = pSrc (moving forward one digit / 4 words every inner loop)
|
||||
; x3 = negated digit count of pSrc
|
||||
; x4 = pDst (moving forward one digit every inner loop)
|
||||
; x5 = pDst (moving forward one word every outer loop)
|
||||
; x6 = Current word loaded from pSrc
|
||||
; x8, x9 = Current words loaded in pairs from pSrc2
|
||||
; x10, x11 = Current words loaded in pairs from pDst
|
||||
; x12, x15 = "128-bit" sliding register to hold the result of multiplies
|
||||
; x16 = Stored pSrc
|
||||
; x17 = Negated digit count of pSrc
|
||||
; x19 = Stored negated digit count of pSrc
|
||||
; x20 = Stored pDst
|
||||
; Note x13, x14 are reserved in ARM64EC and thus are not used
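;
; Editorial note (not part of the original comments): the cyclic counter x9 holds i mod 4.
; The first pass only needs the products pSrc[i]*pSrc[j] with j > i, so each outer iteration
; enters the unrolled inner loop at word (i mod 4) + 1 of the current digit; that is what the
; beq jumps below select. When the counter wraps back to 0, the start of pSrc and pDst is
; instead advanced by a whole digit.
;
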
NESTED_ENTRY ARM64EC_NAME_MANGLE(SymCryptFdefRawSquareAsm)
|
||||
PROLOG_SAVE_REG_PAIR fp, lr, #-32! ; allocate 32 bytes of stack; store FP/LR
|
||||
PROLOG_SAVE_REG_PAIR x19, x20, #16 ; free up x19/x20
|
||||
|
||||
mov x3, x1 ; digit count into x3
|
||||
|
||||
lsl x1, x1, #2 ; Calculate word count
|
||||
|
||||
neg x1, x1 ; negate nWords
|
||||
neg x3, x3 ; negate nDigits
|
||||
|
||||
mov x4, x2 ; pDst
|
||||
mov x5, x2 ; store pDst
|
||||
mov x20, x2 ; store pDst
|
||||
mov x16, x0 ; store pSrc
|
||||
mov x2, x0 ; inner loop pSrc
|
||||
mov x17, x3 ; store -nDigits for later
|
||||
mov x19, x3 ; store -nDigits for later
|
||||
|
||||
;
|
||||
; First iteration of main loop (no adding of previous values from pDst)
|
||||
;
|
||||
ands x15, x15, XZR ; Clearing the carry flag and setting x15 = 0
|
||||
ldr x6, [x0] ; load the first word from pSrc1
|
||||
str x15, [x4] ; store 0 for the first word
|
||||
|
||||
b SymCryptFdefRawSquareAsmInnerLoopInit_Word1
|
||||
|
||||
SymCryptFdefRawSquareAsmInnerLoopInit_Word0
|
||||
SQR_SINGLEADD_64 0
|
||||
|
||||
SymCryptFdefRawSquareAsmInnerLoopInit_Word1
|
||||
SQR_SINGLEADD_64 1
|
||||
|
||||
SQR_SINGLEADD_64 2
|
||||
|
||||
SQR_SINGLEADD_64 3
|
||||
|
||||
add x3, x3, #1 ; move one digit up
|
||||
add x2, x2, #32
|
||||
add x4, x4, #32
|
||||
|
||||
cbnz x3, SymCryptFdefRawSquareAsmInnerLoopInit_Word0
|
||||
|
||||
str x15, [x4] ; Store the next word into the destination
|
||||
|
||||
add x1, x1, #1 ; move one word up
|
||||
|
||||
mov x9, #1 ; Cyclic counter
|
||||
|
||||
;
|
||||
; MAIN LOOP
|
||||
;
|
||||
SymCryptFdefRawSquareAsmOuterLoop
|
||||
|
||||
add x5, x5, #8 ; move start of pDst one word up
|
||||
|
||||
mov x3, x17 ; set -nDigits
|
||||
mov x2, x0 ; set pSrc
|
||||
mov x4, x5 ; set pDst
|
||||
|
||||
ands x15, x15, XZR ; Clearing the carry flag and setting x15 = 0
|
||||
ldr x6, [x0, x9, LSL #3] ; load the next word from pSrc
|
||||
|
||||
; Cyclic counter and jump logic
|
||||
add x9, x9, #1
|
||||
cmp x9, #1
|
||||
beq SymCryptFdefRawSquareAsmInnerLoop_Word1
|
||||
cmp x9, #2
|
||||
beq SymCryptFdefRawSquareAsmInnerLoop_Word2
|
||||
cmp x9, #3
|
||||
beq SymCryptFdefRawSquareAsmInnerLoop_Word3
|
||||
|
||||
; The following instructions are only executed when x9 == 4
|
||||
mov x9, XZR ; Set it to 0
|
||||
|
||||
add x0, x0, #32 ; move start of pSrc 4 words up
|
||||
add x5, x5, #32 ; move pDst 4 words up
|
||||
|
||||
mov x2, x0 ; set pSrc
|
||||
mov x4, x5 ; set pDst
|
||||
|
||||
adds x17, x17, #1 ; add 1 digit
|
||||
mov x3, x17 ; set the new digit counter
|
||||
|
||||
SymCryptFdefRawSquareAsmInnerLoop_Word0
|
||||
SQR_DOUBLEADD_64 0
|
||||
|
||||
SymCryptFdefRawSquareAsmInnerLoop_Word1
|
||||
SQR_DOUBLEADD_64 1
|
||||
|
||||
SymCryptFdefRawSquareAsmInnerLoop_Word2
|
||||
SQR_DOUBLEADD_64 2
|
||||
|
||||
SymCryptFdefRawSquareAsmInnerLoop_Word3
|
||||
SQR_DOUBLEADD_64 3
|
||||
|
||||
add x3, x3, #1 ; move one digit up
|
||||
add x2, x2, #32
|
||||
add x4, x4, #32
|
||||
|
||||
cbnz x3, SymCryptFdefRawSquareAsmInnerLoop_Word0
|
||||
|
||||
str x15, [x4] ; Store the next word into the destination
|
||||
|
||||
adds x1, x1, #1 ; move one word up
|
||||
cmn x1, #1 ; Compare with -1
|
||||
bne SymCryptFdefRawSquareAsmOuterLoop
|
||||
|
||||
ands x15, x15, XZR ; Setting x15 = 0
|
||||
str x15, [x5, #40] ; Store 0 to destination for the top word
|
||||
|
||||
; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; Second Pass - Shifting all results 1 bit left
|
||||
; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
mov x3, x19 ; -nDigits
|
||||
lsl x3, x3, #1 ; Double digits
|
||||
mov x4, x20 ; pDst pointer
|
||||
ands x8, x8, XZR ; Clear the flags
|
||||
|
||||
SymCryptFdefRawSquareAsmSecondPass
|
||||
|
||||
add x3, x3, #1 ; move one digit up
|
||||
|
||||
ldp x8, x9, [x4]
|
||||
adcs x8, x8, x8 ; Shift left and add the carry
|
||||
adcs x9, x9, x9
|
||||
stp x8, x9, [x4], #16
|
||||
|
||||
ldp x10, x11, [x4]
|
||||
adcs x10, x10, x10 ; Shift left and add the carry
|
||||
adcs x11, x11, x11
|
||||
stp x10, x11, [x4], #16
|
||||
|
||||
cbnz x3, SymCryptFdefRawSquareAsmSecondPass
|
||||
|
||||
; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; Third Pass - Adding the squares on the even columns and propagating the sum
|
||||
; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
ands x8, x8, XZR ; Clear the flags
|
||||
mov x0, x16 ; src pointer
|
||||
mov x4, x20 ; pDst pointer
|
||||
mov x3, x19 ; -nDigits
|
||||
|
||||
SymCryptFdefRawSquareAsmThirdPass
|
||||
SQR_DIAGONAL_PROP 0
|
||||
SQR_DIAGONAL_PROP 1
|
||||
SQR_DIAGONAL_PROP 2
|
||||
SQR_DIAGONAL_PROP 3
|
||||
|
||||
add x3, x3, #1 ; move one digit up
|
||||
add x0, x0, #32 ; One digit up (not updated in SQR_DIAGONAL_PROP)
|
||||
add x4, x4, #64 ; Two digits up (not updated in SQR_DIAGONAL_PROP)
|
||||
|
||||
cbnz x3, SymCryptFdefRawSquareAsmThirdPass
|
||||
|
||||
; Done, no return value
|
||||
|
||||
EPILOG_RESTORE_REG_PAIR x19, x20, #16
|
||||
EPILOG_RESTORE_REG_PAIR fp, lr, #32!
|
||||
EPILOG_RETURN
|
||||
|
||||
NESTED_END ARM64EC_NAME_MANGLE(SymCryptFdefRawSquareAsm)
|
||||
|
||||
;VOID
|
||||
;SymCryptFdefMontgomeryReduceAsm(
|
||||
; _In_ PCSYMCRYPT_MODULUS pmMod,
|
||||
; _In_ PUINT32 pSrc,
|
||||
; _Out_ PUINT32 pDst )
|
||||
;
|
||||
; Initial inputs to registers:
|
||||
; pmMod -> x0
|
||||
; pSrc -> x1
|
||||
; pDst -> x2
|
||||
;
|
||||
; Register assignments
|
||||
; x0 = pMod (moving forward one *digit* every inner loop)
|
||||
; x1 = pSrc (moving forward one *digit* every inner loop)
|
||||
; x2 = pDst (used only in the end for subtract / result)
|
||||
; x3 = negated digit count of pSrc and pMod
|
||||
; x4 = negated word count of pSrc
|
||||
; x5 = Inv64 of the modulus
|
||||
; x6 = m = pSrc[i]*Inv64
|
||||
; x7 = hc = high carry variable
|
||||
; x8, x9 = Current words loaded in pairs from pSrc
|
||||
; x10, x11 = Current words loaded in pairs from pMod
|
||||
; x12, x15 = c variable = "128-bit" register to hold the result of multiplies
|
||||
; It is flipped between [x12:x15] and [x15:x12] instead of doing c>>=64
|
||||
; x16 = Temporary intermediate result
|
||||
; x17 = Stored negated digit count of pSrc
|
||||
; x19 = Stored pMod pointer
|
||||
; x20 = Stored pSrc pointer (moving forward one word every outer loop)
|
||||
; Note x13, x14 are reserved in ARM64EC and thus are not used
|
||||
|
||||
NESTED_ENTRY ARM64EC_NAME_MANGLE(SymCryptFdefMontgomeryReduceAsm)
|
||||
PROLOG_SAVE_REG_PAIR fp, lr, #-32!
|
||||
PROLOG_SAVE_REG_PAIR x19, x20, #16
|
||||
|
||||
ldr w3, [x0, #SymCryptModulusNdigitsOffsetArm64] ; # of Digits
|
||||
ldr x5, [x0, #SymCryptModulusMontgomeryInv64OffsetArm64] ; Inv64 of modulus
|
||||
add x0, x0, #SymCryptModulusValueOffsetArm64 ; pMod
|
||||
|
||||
lsl x4, x3, #2 ; Multiply by 4 to get the number of words
|
||||
|
||||
neg x3, x3 ; Negate the digit count
|
||||
neg x4, x4 ; Negate the word count
|
||||
|
||||
mov x17, x3 ; Store the digit count for later
|
||||
mov x19, x0 ; Store the pMod pointer
|
||||
mov x20, x1 ; Store the pSrc pointer
|
||||
|
||||
ands x7, x7, XZR ; Set hc to 0
|
||||
|
||||
;
|
||||
; Main loop
|
||||
;
|
||||
SymCryptFdefMontgomeryReduceAsmOuter
|
||||
ldr x8, [x1] ; Load 1 word from pSrc
|
||||
mul x6, x8, x5 ; <63:0> bits of pSrc[i]*Inv64 = m
|
||||
|
||||
ands x12, x12, XZR ; Set c to 0
|
||||
ands x15, x15, XZR ; Set c to 0
|
||||
|
||||
SymCryptFdefMontgomeryReduceAsmInner
|
||||
ldp x10, x11, [x0] ; pMod[j]
|
||||
ldp x8, x9, [x1] ; pSrc[j]
|
||||
|
||||
mul x16, x6, x10 ; <63:0> of pMod[j]*m
|
||||
adds x16, x16, x8 ; Adding pSrc[j]
|
||||
umulh x15, x6, x10 ; <127:64> of pMod[j]*m
|
||||
adc x15, x15, XZR ; Add the carry if any (***)
|
||||
adds x12, x12, x16 ; Add the lower bits of c
|
||||
adc x15, x15, XZR ; Add the carry if any (***)
|
||||
; ***: These cannot produce extra carry as the maximum is
|
||||
; (2^64 - 1)*(2^64-1) + 2^64-1 + 2^64-1 = 2^128 - 1
|
||||
str x12, [x1] ; pSrc[j] = (UINT64) c
|
||||
|
||||
mul x16, x6, x11 ; <63:0> of pMod[j]*m
|
||||
adds x16, x16, x9 ; Adding pSrc[j]
|
||||
umulh x12, x6, x11 ; <127:64> of pMod[j]*m
|
||||
adc x12, x12, XZR ; Add the carry if any (***)
|
||||
adds x15, x15, x16 ; Add the lower bits of c
|
||||
adc x12, x12, XZR ; Add the carry if any (***)
|
||||
str x15, [x1, #8] ; pSrc[j] = (UINT64) c
|
||||
|
||||
ldp x10, x11, [x0, #16] ; pMod[j]
|
||||
ldp x8, x9, [x1, #16] ; pSrc[j]
|
||||
|
||||
mul x16, x6, x10 ; <63:0> of pMod[j]*m
|
||||
adds x16, x16, x8 ; Adding pSrc[j]
|
||||
umulh x15, x6, x10 ; <127:64> of pMod[j]*m
|
||||
adc x15, x15, XZR ; Add the carry if any (***)
|
||||
adds x12, x12, x16 ; Add the lower bits of c
|
||||
adc x15, x15, XZR ; Add the carry if any (***)
|
||||
str x12, [x1, #16] ; pSrc[j] = (UINT64) c
|
||||
|
||||
mul x16, x6, x11 ; <63:0> of pMod[j]*m
|
||||
adds x16, x16, x9 ; Adding pSrc[j]
|
||||
umulh x12, x6, x11 ; <127:64> of pMod[j]*m
|
||||
adc x12, x12, XZR ; Add the carry if any (***)
|
||||
adds x15, x15, x16 ; Add the lower bits of c
|
||||
adc x12, x12, XZR ; Add the carry if any (***)
|
||||
str x15, [x1, #24] ; pSrc[j] = (UINT64) c
|
||||
|
||||
add x0, x0, #32
|
||||
add x1, x1, #32
|
||||
adds x3, x3, #1 ; Move one digit up
|
||||
bne SymCryptFdefMontgomeryReduceAsmInner
|
||||
|
||||
ldr x8, [x1] ; pSrc[nWords]
|
||||
adds x12, x12, x8 ; c + pSrc[nWords]
|
||||
adc x15, XZR, XZR ; Add the carry if any
|
||||
|
||||
adds x12, x12, x7 ; c + pSrc[nWords] + hc
|
||||
adc x7, x15, XZR ; Add the carry if any and store into hc
|
||||
|
||||
str x12, [x1] ; pSrc[nWords] = c
|
||||
|
||||
adds x4, x4, #1 ; Move one word up
|
||||
|
||||
add x20, x20, #8 ; Move stored pSrc pointer one word up
|
||||
mov x0, x19 ; Restore pMod pointer
|
||||
mov x1, x20 ; Restore pSrc pointer
|
||||
|
||||
mov x3, x17 ; Restore the digit counter
|
||||
|
||||
bne SymCryptFdefMontgomeryReduceAsmOuter
|
||||
|
||||
;
|
||||
; Subtraction
|
||||
;
|
||||
|
||||
mov x16, x2 ; Store pDst pointer
|
||||
|
||||
; Prepare the pointers for subtract
|
||||
mov x0, x20 ; pSrc
|
||||
mov x1, x19 ; pMod
|
||||
|
||||
mov x10, x7 ; x10 = hc
|
||||
mov x3, x17 ; Restore the digit counter
|
||||
subs x4, x4, x4 ; Set the carry flag (i.e. no borrow)
|
||||
|
||||
SymCryptFdefMontgomeryReduceRawSubAsmLoop
|
||||
add x3, x3, #1 ; Increment the digit count by one
|
||||
; borrow is in the carry flag (flipped)
|
||||
|
||||
ldp x4, x6, [x0], #16 ; Load two words of pSrc1
|
||||
ldp x5, x7, [x1], #16 ; Load two words of pSrc2
|
||||
sbcs x4, x4, x5
|
||||
sbcs x6, x6, x7
|
||||
stp x4, x6, [x2], #16 ; Store the result in the destination
|
||||
|
||||
|
||||
ldp x4, x6, [x0], #16 ; Load two words of pSrc1
|
||||
ldp x5, x7, [x1], #16 ; Load two words of pSrc2
|
||||
sbcs x4, x4, x5
|
||||
sbcs x6, x6, x7
|
||||
stp x4, x6, [x2], #16 ; Store the result in the destination
|
||||
|
||||
cbnz x3, SymCryptFdefMontgomeryReduceRawSubAsmLoop
|
||||
|
||||
csetcc x0 ; If the carry is clear (borrow), set the return value to 1
|
||||
|
||||
orr x11, x10, x0 ; x11 = hc|d
|
||||
|
||||
; Prepare the pointers for masked copy
|
||||
mov x0, x20 ; pSrc
|
||||
mov x1, x16 ; pDst
|
||||
|
||||
mov x2, x17 ; Restore the digit counter
|
||||
subs x4, x10, x11 ; If (x11 > x10) clear the carry flag (i.e. borrow)
|
||||
|
||||
SymCryptFdefMontgomeryReduceMaskedCopyAsmLoop
|
||||
add x2, x2, #1 ; Increment the digit count by one
|
||||
|
||||
ldp x4, x6, [x0], #16 ; Load two words of the source
|
||||
ldp x5, x7, [x1] ; Load two words of the destination
|
||||
cselcc x4, x4, x5 ; If the carry is clear, select the source operands
|
||||
cselcc x6, x6, x7
|
||||
stp x4, x6, [x1], #16 ; Store the two words in the destination
|
||||
|
||||
ldp x4, x6, [x0], #16
|
||||
ldp x5, x7, [x1]
|
||||
cselcc x4, x4, x5
|
||||
cselcc x6, x6, x7
|
||||
stp x4, x6, [x1], #16
|
||||
|
||||
cbnz x2, SymCryptFdefMontgomeryReduceMaskedCopyAsmLoop
|
||||
|
||||
; Done, no return value
|
||||
|
||||
EPILOG_RESTORE_REG_PAIR x19, x20, #16
|
||||
EPILOG_RESTORE_REG_PAIR fp, lr, #32!
|
||||
EPILOG_RETURN
|
||||
|
||||
NESTED_END ARM64EC_NAME_MANGLE(SymCryptFdefMontgomeryReduceAsm)
|
||||
|
||||
END
|
||||
|
|
@ -0,0 +1,705 @@
|
|||
//
|
||||
// fdef_asm.symcryptasm Assembler code for large integer arithmetic in the default data format for the arm64 architecture
|
||||
// Expresses asm in a generic enough way to enable generation of MASM and GAS using the
|
||||
// symcryptasm_processor.py script and C preprocessor
|
||||
//
|
||||
// Copyright (c) Microsoft Corporation. Licensed under the MIT license.
|
||||
//
|
||||
|
||||
#include "symcryptasm_shared.cppasm"
|
||||
|
||||
// A digit consists of 4 words of 64 bits each
|
||||
|
||||
//UINT32
|
||||
//SYMCRYPT_CALL
|
||||
//SymCryptFdefRawAddAsm(
|
||||
// _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 Src1,
|
||||
// _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 Src2,
|
||||
// _Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 Dst,
|
||||
// UINT32 nDigits )
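//
// Editorial sketch (not part of the original source): the routine is a plain multi-word
// add with carry, roughly the following C, using a hypothetical UINT128 type and
// nWords = nDigits * 4:
//
//   UINT64 c = 0;
//   for( i = 0; i < nWords; i++ )
//   {
//       UINT128 t = (UINT128) pSrc1[i] + pSrc2[i] + c;
//       pDst[i] = (UINT64) t;
//       c = (UINT64)( t >> 64 );    // kept in the carry flag in the asm below
//   }
//   return (UINT32) c;              // cset X_0, cs
//
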
FUNCTION_START(ARM64EC_NAME_MANGLE(SymCryptFdefRawAddAsm), 4, 8)
|
||||
|
||||
ldp X_4, X_6, [X_0] // Load two words of pSrc1
|
||||
ldp X_5, X_7, [X_1] // Load two words of pSrc2
|
||||
adds X_4, X_4, X_5
|
||||
adcs X_6, X_6, X_7
|
||||
stp X_4, X_6, [X_2] // Store the result in the destination
|
||||
|
||||
ldp X_4, X_6, [X_0, #16] // Load two words of pSrc1
|
||||
sub X_3, X_3, #1 // Decrement the digit count by one
|
||||
ldp X_5, X_7, [X_1, #16] // Load two words of pSrc2
|
||||
adcs X_4, X_4, X_5
|
||||
adcs X_6, X_6, X_7
|
||||
stp X_4, X_6, [X_2, #16] // Store the result in the destination
|
||||
|
||||
cbz X_3, SymCryptFdefRawAddAsmEnd
|
||||
|
||||
LABEL(SymCryptFdefRawAddAsmLoop)
|
||||
// carry is in the carry flag
|
||||
// only update pointers to srcs and destination once per loop to reduce uops and dependencies
|
||||
ldp X_4, X_6, [X_0, #32]! // Load two words of pSrc1
|
||||
ldp X_5, X_7, [X_1, #32]! // Load two words of pSrc2
|
||||
adcs X_4, X_4, X_5
|
||||
adcs X_6, X_6, X_7
|
||||
stp X_4, X_6, [X_2, #32]! // Store the result in the destination
|
||||
|
||||
ldp X_4, X_6, [X_0, #16] // Load two words of pSrc1
|
||||
sub X_3, X_3, #1 // Decrement the digit count by one
|
||||
ldp X_5, X_7, [X_1, #16] // Load two words of pSrc2
|
||||
adcs X_4, X_4, X_5
|
||||
adcs X_6, X_6, X_7
|
||||
stp X_4, X_6, [X_2, #16] // Store the result in the destination
|
||||
|
||||
cbnz X_3, SymCryptFdefRawAddAsmLoop
|
||||
|
||||
ALIGN(4)
|
||||
LABEL(SymCryptFdefRawAddAsmEnd)
|
||||
cset X_0, cs // Set the return value equal to the carry
|
||||
|
||||
FUNCTION_END(ARM64EC_NAME_MANGLE(SymCryptFdefRawAddAsm))
|
||||
|
||||
//UINT32
|
||||
//SYMCRYPT_CALL
|
||||
//SymCryptFdefRawSubAsm(
|
||||
// _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc1,
|
||||
// _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc2,
|
||||
// _Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 pDst,
|
||||
// UINT32 nDigits )
|
||||
|
||||
FUNCTION_START(ARM64EC_NAME_MANGLE(SymCryptFdefRawSubAsm), 4, 8)
|
||||
|
||||
ldp X_4, X_6, [X_0] // Load two words of pSrc1
|
||||
ldp X_5, X_7, [X_1] // Load two words of pSrc2
|
||||
subs X_4, X_4, X_5
|
||||
sbcs X_6, X_6, X_7
|
||||
stp X_4, X_6, [X_2] // Store the result in the destination
|
||||
|
||||
ldp X_4, X_6, [X_0, #16] // Load two words of pSrc1
|
||||
sub X_3, X_3, #1 // Decrement the digit count by one
|
||||
ldp X_5, X_7, [X_1, #16] // Load two words of pSrc2
|
||||
sbcs X_4, X_4, X_5
|
||||
sbcs X_6, X_6, X_7
|
||||
stp X_4, X_6, [X_2, #16] // Store the result in the destination
|
||||
|
||||
cbz X_3, SymCryptFdefRawSubAsmEnd
|
||||
|
||||
LABEL(SymCryptFdefRawSubAsmLoop)
|
||||
// borrow is in the carry flag (flipped)
|
||||
// only update pointers to srcs and destination once per loop to reduce uops and dependencies
|
||||
ldp X_4, X_6, [X_0, #32]! // Load two words of pSrc1
|
||||
ldp X_5, X_7, [X_1, #32]! // Load two words of pSrc2
|
||||
sbcs X_4, X_4, X_5
|
||||
sbcs X_6, X_6, X_7
|
||||
stp X_4, X_6, [X_2, #32]! // Store the result in the destination
|
||||
|
||||
ldp X_4, X_6, [X_0, #16] // Load two words of pSrc1
|
||||
sub X_3, X_3, #1 // Decrement the digit count by one
|
||||
ldp X_5, X_7, [X_1, #16] // Load two words of pSrc2
|
||||
sbcs X_4, X_4, X_5
|
||||
sbcs X_6, X_6, X_7
|
||||
stp X_4, X_6, [X_2, #16] // Store the result in the destination
|
||||
|
||||
cbnz X_3, SymCryptFdefRawSubAsmLoop
|
||||
|
||||
ALIGN(4)
|
||||
LABEL(SymCryptFdefRawSubAsmEnd)
|
||||
cset X_0, cc // If the carry is clear (borrow), set the return value to 1
|
||||
|
||||
FUNCTION_END(ARM64EC_NAME_MANGLE(SymCryptFdefRawSubAsm))
|
||||
|
||||
//VOID
|
||||
//SYMCRYPT_CALL
|
||||
//SymCryptFdefMaskedCopyAsm(
|
||||
// _In_reads_bytes_( nDigits*SYMCRYPT_FDEF_DIGIT_SIZE ) PCBYTE pbSrc,
|
||||
// _Inout_updates_bytes_( nDigits*SYMCRYPT_FDEF_DIGIT_SIZE ) PBYTE pbDst,
|
||||
// UINT32 nDigits,
|
||||
// UINT32 mask )
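//
// Editorial note (not part of the original source): mask is expected to be either 0 or
// 0xffffffff. It is broadcast into a NEON register and the bit instruction then computes,
// for every byte,
//
//   pbDst[i] = ( pbSrc[i] & mask ) | ( pbDst[i] & ~mask );
//
// i.e. a branch-free select between keeping the destination and copying the source.
//
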
FUNCTION_START(ARM64EC_NAME_MANGLE(SymCryptFdefMaskedCopyAsm), 4, 4)
|
||||
|
||||
dup v0.4s, W_3 // broadcast the mask to v0
|
||||
|
||||
LABEL(SymCryptFdefMaskedCopyAsmLoop)
|
||||
ldp q1, q3, [X_0], #32 // Load 4 words of the source
|
||||
ldp q2, q4, [X_1] // Load 4 words of the destination
|
||||
bit v2.16b, v1.16b, v0.16b // if the mask is 1s, overwrite the destination with source
|
||||
bit v4.16b, v3.16b, v0.16b // if the mask is 1s, overwrite the destination with source
|
||||
stp q2, q4, [X_1], #32 // Store the two words in the destination
|
||||
|
||||
sub X_2, X_2, #1 // Decrement the digit count by one
|
||||
|
||||
cbnz X_2, SymCryptFdefMaskedCopyAsmLoop
|
||||
|
||||
// Done, no return value
|
||||
|
||||
FUNCTION_END(ARM64EC_NAME_MANGLE(SymCryptFdefMaskedCopyAsm))
|
||||
|
||||
//VOID
|
||||
//SYMCRYPT_CALL
|
||||
//SymCryptFdefRawMulAsm(
|
||||
// _In_reads_(nDigits1*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc1,
|
||||
// UINT32 nDigits1,
|
||||
// _In_reads_(nDigits2*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc2,
|
||||
// UINT32 nDigits2,
|
||||
// _Out_writes_((nDigits1+nDigits2)*SYMCRYPT_FDEF_DIGIT_NUINT32) PUINT32 pDst )
|
||||
//
|
||||
// Basic structure:
|
||||
// for each word in Src1:
|
||||
// Dst += Src2 * word
|
||||
//
|
||||
// Register assignments
|
||||
// X_0 = pSrc1 (moving forward one word every outer loop)
|
||||
// X_1 = word count of pSrc1
|
||||
// X_2 = pSrc2 (moving forward one *digit* every inner loop)
|
||||
// X_3 = digit count of pSrc2 and pDst
|
||||
// X_4 = pDst (moving forward one *digit* every inner loop)
|
||||
// X_5 = Stored pDst (moving forward one word every outer loop)
|
||||
// X_6 = Current word loaded from pSrc1
|
||||
// X_7, X_8 = Current words loaded in pairs from pSrc2
|
||||
// X_9, X_10 = Current words loaded in pairs from pDst
|
||||
// X_11, X_12 = Scratch registers for holding the results of multiplies
|
||||
// X_13 = Stored pSrc2
|
||||
// X_14 = Stored digit count of pSrc2
|
||||
// X_15 = Scratch register for holding the results of multiplies
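//
// Editorial sketch (not part of the original source): overall this is a schoolbook multiply,
// roughly the following C, using a hypothetical UINT128 type and word counts
// nWords1 = nDigits1 * 4, nWords2 = nDigits2 * 4:
//
//   for( i = 0; i < nWords1; i++ )
//   {
//       UINT64 c = 0;
//       for( j = 0; j < nWords2; j++ )
//       {
//           // for i == 0 the pDst[i+j] term is omitted, so pDst needs no initial clearing
//           UINT128 p = (UINT128) pSrc1[i] * pSrc2[j] + pDst[i + j] + c;
//           pDst[i + j] = (UINT64) p;
//           c = (UINT64)( p >> 64 );
//       }
//       pDst[i + nWords2] = c;
//   }
//
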
FUNCTION_START(ARM64EC_NAME_MANGLE(SymCryptFdefRawMulAsm), 5, 16)
|
||||
|
||||
lsl X_1, X_1, #2 // Calculate word count
|
||||
|
||||
sub X_2, X_2, #32 // offset pSrc2 so we can use pre-increment form of loads
|
||||
sub X_4, X_4, #32 // offset pDst so we can use pre-increment form of loads
|
||||
|
||||
mov X_5, X_4 // store pDst
|
||||
mov X_13, X_2 // store pSrc2
|
||||
mov X_14, X_3 // store nDigits2 for later
|
||||
|
||||
//
|
||||
// First iteration of main loop (no adding of previous values from pDst)
|
||||
//
|
||||
ands X_12, X_12, xzr // Clearing the carry flag and setting X_12 = 0
|
||||
ldr X_6, [X_0] // load the first word from pSrc1
|
||||
|
||||
LABEL(SymCryptFdefRawMulAsmLoopInner1)
|
||||
sub X_3, X_3, #1 // move one digit up
|
||||
|
||||
ldp X_7, X_8, [X_2, #32]! // load 2 words from pSrc2
|
||||
|
||||
mul X_11, X_6, X_7 // Bits <63:0> of pSrc1[0]*pSrc2[j]
|
||||
adcs X_11, X_11, X_12 // Adding the previous word (if there was a carry from the last addition it is added)
|
||||
umulh X_12, X_6, X_7 // Bits <127:64> of pSrc1[0]*pSrc2[j]
|
||||
|
||||
mul X_15, X_6, X_8 // Bits <63:0> of pSrc1[0]*pSrc2[j+1]
|
||||
adcs X_15, X_15, X_12 // Adding the previous word (if there was a carry from the last addition it is added)
|
||||
umulh X_12, X_6, X_8 // Bits <127:64> of pSrc1[0]*pSrc2[j+1]
|
||||
|
||||
stp X_11, X_15, [X_4, #32]! // Store to destination
|
||||
ldp X_7, X_8, [X_2, #16] // load 2 words from pSrc2
|
||||
|
||||
mul X_11, X_6, X_7 // Bits <63:0> of pSrc1[0]*pSrc2[j+2]
|
||||
adcs X_11, X_11, X_12 // Adding the previous word (if there was a carry from the last addition it is added)
|
||||
umulh X_12, X_6, X_7 // Bits <127:64> of pSrc1[0]*pSrc2[j+2]
|
||||
|
||||
mul X_15, X_6, X_8 // Bits <63:0> of pSrc1[0]*pSrc2[j+3]
|
||||
adcs X_15, X_15, X_12 // Adding the previous word (if there was a carry from the last addition it is added)
|
||||
umulh X_12, X_6, X_8 // Bits <127:64> of pSrc1[0]*pSrc2[j+3]
|
||||
|
||||
stp X_11, X_15, [X_4, #16] // Store to destination
|
||||
|
||||
cbnz X_3, SymCryptFdefRawMulAsmLoopInner1
|
||||
|
||||
adc X_12, X_12, xzr // Store the next word into the destination (with the carry if any)
|
||||
str X_12, [X_4, #32]
|
||||
|
||||
sub X_1, X_1, #1 // move one word up
|
||||
add X_0, X_0, #8 // move start of pSrc1 one word up
|
||||
add X_5, X_5, #8 // move start of pDst one word up
|
||||
|
||||
//
|
||||
// MAIN LOOP
|
||||
//
|
||||
LABEL(SymCryptFdefRawMulAsmLoopOuter)
|
||||
mov X_3, X_14 // set nDigits2
|
||||
mov X_2, X_13 // set pSrc2
|
||||
mov X_4, X_5 // set pDst
|
||||
|
||||
ands X_12, X_12, xzr // Clearing the carry flag and setting X_12 = 0
|
||||
ldr X_6, [X_0] // load the next word from pSrc1
|
||||
|
||||
LABEL(SymCryptFdefRawMulAsmLoopInner)
|
||||
sub X_3, X_3, #1 // move one digit up
|
||||
|
||||
ldp X_7, X_8, [X_2, #32]! // load 2 words from pSrc2
|
||||
ldp X_9, X_10, [X_4, #32]! // load 2 words from pDst
|
||||
|
||||
adcs X_9, X_9, X_12 // Adding the previous word (if there was a carry from the last addition it is added)
|
||||
umulh X_11, X_6, X_7 // Bits <127:64> of pSrc1[i]*pSrc2[j]
|
||||
adcs X_10, X_11, X_10 // Adding the previous word (if there was a carry from the last addition it is added)
|
||||
umulh X_12, X_6, X_8 // Bits <127:64> of pSrc1[i]*pSrc2[j+1]
|
||||
adc X_12, X_12, xzr // Add the carry if any and don't update the flags
|
||||
|
||||
mul X_11, X_6, X_7 // Bits <63:0> of pSrc1[i]*pSrc2[j]
|
||||
adds X_9, X_9, X_11 // add the word from the destination and update the flags (this can overflow)
|
||||
mul X_11, X_6, X_8 // Bits <63:0> of pSrc1[i]*pSrc2[j+1]
|
||||
adcs X_10, X_10, X_11 // add the word from the destination and update the flags (this can overflow)
|
||||
|
||||
stp X_9, X_10, [X_4] // Store to destination
|
||||
|
||||
ldp X_7, X_8, [X_2, #16] // load 2 words from pSrc2
|
||||
ldp X_9, X_10, [X_4, #16] // load 2 words from pDst
|
||||
|
||||
adcs X_9, X_9, X_12 // Adding the previous word (if there was a carry from the last addition it is added)
|
||||
umulh X_11, X_6, X_7 // Bits <127:64> of pSrc1[i]*pSrc2[j+2]
|
||||
adcs X_10, X_11, X_10 // Adding the previous word (if there was a carry from the last addition it is added)
|
||||
umulh X_12, X_6, X_8 // Bits <127:64> of pSrc1[i]*pSrc2[j+3]
|
||||
adc X_12, X_12, xzr // Add the carry if any and don't update the flags
|
||||
|
||||
mul X_11, X_6, X_7 // Bits <63:0> of pSrc1[i]*pSrc2[j+2]
|
||||
adds X_9, X_9, X_11 // add the word from the destination and update the flags (this can overflow)
|
||||
mul X_11, X_6, X_8 // Bits <63:0> of pSrc1[i]*pSrc2[j+3]
|
||||
adcs X_10, X_10, X_11 // add the word from the destination and update the flags (this can overflow)
|
||||
|
||||
stp X_9, X_10, [X_4, #16] // Store to destination
|
||||
|
||||
cbnz X_3, SymCryptFdefRawMulAsmLoopInner
|
||||
|
||||
adc X_12, X_12, xzr // Store the next word into the destination (with the carry if any)
|
||||
str X_12, [X_4, #32]
|
||||
|
||||
subs X_1, X_1, #1 // move one word up
|
||||
add X_0, X_0, #8 // move start of pSrc1 one word up
|
||||
add X_5, X_5, #8 // move start of pDst one word up
|
||||
|
||||
bne SymCryptFdefRawMulAsmLoopOuter
|
||||
|
||||
// Done, no return value
|
||||
|
||||
FUNCTION_END(ARM64EC_NAME_MANGLE(SymCryptFdefRawMulAsm))
|
||||
|
||||
// Macro for the first loop of the first pass of RawSquareAsm.
|
||||
// It takes one word from the source, multiplies it with the mulword,
|
||||
// adds the high level word of the previous macro call, and stores it into
|
||||
// the destination.
|
||||
//
|
||||
// No carry flag is propagated from the previous macro call as the maximum is
|
||||
// (2^64-1)^2 + 2^64-1 = 2^128 - 2^64
|
||||
MACRO_START(SQR_SINGLEADD_64, index, src_reg, dst_reg, mul_word, src_carry, dst_carry, scratch0, scratch1)
|
||||
|
||||
ldr scratch0, [src_reg, #8*index] // pSrc[i+j]
|
||||
|
||||
mul scratch1, mul_word, scratch0 // Bits <63:0> of pSrc[i]*pSrc[i+j]
|
||||
adds scratch1, scratch1, src_carry // Adding the previous word
|
||||
umulh dst_carry, mul_word, scratch0 // Bits <127:64> of pSrc[i]*pSrc[i+j]
|
||||
adc dst_carry, dst_carry, xzr // Add the intermediate carry and don't update the flags
|
||||
|
||||
str scratch1, [dst_reg, #8*index] // Store to destination
|
||||
|
||||
MACRO_END()
|
||||
|
||||
// Macro for the remaining loops of the first pass of RawSquareAsm.
|
||||
// The only difference to the above is that it also adds the word loaded
|
||||
// from the destination buffer.
|
||||
//
|
||||
// No carry flag is propagated from the previous macro call as the maximum is
|
||||
// (2^64-1)^2 + 2(2^64-1) = 2^128 - 1
|
||||
MACRO_START(SQR_DOUBLEADD_64, index, src_reg, dst_reg, mul_word, src_carry, dst_carry, scratch0, scratch1, scratch2)
|
||||
|
||||
ldr scratch0, [src_reg, #8*index] // pSrc[i+j]
|
||||
ldr scratch2, [dst_reg, #8*index] // pDst[2*(i+j)]
|
||||
|
||||
mul scratch1, mul_word, scratch0 // Bits <63:0> of pSrc[i]*pSrc[i+j]
|
||||
adds scratch1, scratch1, src_carry // Adding the previous word
|
||||
umulh dst_carry, mul_word, scratch0 // Bits <127:64> of pSrc[i]*pSrc[i+j]
|
||||
adc dst_carry, dst_carry, xzr // Add the intermediate carry and don't update the flags
|
||||
|
||||
adds scratch1, scratch1, scratch2 // Add the word from the destination
|
||||
adc dst_carry, dst_carry, xzr // Add the intermediate carry and don't update the flags
|
||||
|
||||
str scratch1, [dst_reg, #8*index] // Store to destination
|
||||
|
||||
MACRO_END()
|
||||
|
||||
// Macro for the third pass loop of RawSquareAsm.
|
||||
// It takes one mulword from the source, squares it, and
|
||||
// adds it to the even columns of the destination. The carries are propagated
|
||||
// to the odd columns.
|
||||
//
|
||||
// Here we can have a (1-bit) carry to the next call because the maximum value for
|
||||
// a pair of columns is (2^64-1)^2+(2^128-1)+1 = 2^129 - 2^65 + 1 < 2^129 - 1
|
||||
MACRO_START(SQR_DIAGONAL_PROP, index, src_reg, dst_reg, squarelo, squarehi, scratch0, scratch1)
|
||||
|
||||
ldr squarehi, [src_reg, #8*index] // mulword
|
||||
mul squarelo, squarehi, squarehi // Bits <63:0> of m^2
|
||||
umulh squarehi, squarehi, squarehi // Bits <127:64> of m^2
|
||||
|
||||
ldp scratch0, scratch1, [dst_reg, #16*index] // Load
|
||||
|
||||
// Adding the square to the even column
|
||||
adcs squarelo, squarelo, scratch0 // carry from previous and update the flags
|
||||
|
||||
// Propagating the sum to the next column
|
||||
adcs squarehi, squarehi, scratch1 // This can generate a carry
|
||||
|
||||
stp squarelo, squarehi, [dst_reg, #16*index] // Store
|
||||
|
||||
MACRO_END()
|
||||
|
||||
//VOID
|
||||
//SYMCRYPT_CALL
|
||||
//SymCryptFdefRawSquareAsm(
|
||||
// _In_reads_(nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc,
|
||||
// UINT32 nDigits,
|
||||
// _Out_writes_(2*nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PUINT32 pDst )
|
||||
//
|
||||
// Register assignments
|
||||
// X_0 = pSrc
|
||||
// X_1 = word count of pSrc
|
||||
// X_2 = pSrc (moving forward one digit / 4 words every inner loop)
|
||||
// X_3 = digit count of pSrc
|
||||
// X_4 = pDst (moving forward one digit every inner loop)
|
||||
// X_5 = pDst (moving forward one word every outer loop)
|
||||
// X_6 = Current word loaded from pSrc
|
||||
// X_7, X_8 = Current words loaded in pairs from pSrc2
|
||||
// X_9, X_10 = Current words loaded in pairs from pDst
|
||||
// X_11, X_12 = "128-bit" sliding register to hold the result of multiplies
|
||||
// X_13 = Stored pSrc
|
||||
// X_14 = Digit count of pSrc
|
||||
// X_15 = Stored digit count of pSrc
|
||||
// X_16 = Stored pDst
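//
// Editorial sketch (not part of the original source): the square is computed in three passes,
// with nWords = nDigits * 4:
//
//   Pass 1: for each i, accumulate the off-diagonal products pSrc[i]*pSrc[j] for j > i into
//           pDst[i+j] (pDst[0] and the top word are written as 0 explicitly)
//   Pass 2: shift the whole 2*nWords word result left by one bit, doubling every cross product
//   Pass 3: add the squares pSrc[i]^2 onto columns 2*i and 2*i+1, letting the carries ripple
//           through the odd columns
//
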
FUNCTION_START(ARM64EC_NAME_MANGLE(SymCryptFdefRawSquareAsm), 3, 17)
|
||||
|
||||
mov X_3, X_1 // digit count into X_3
|
||||
|
||||
lsl X_1, X_1, #2 // Calculate word count
|
||||
|
||||
mov X_4, X_2 // pDst
|
||||
mov X_5, X_2 // store pDst
|
||||
mov X_16, X_2 // store pDst
|
||||
mov X_13, X_0 // store pSrc
|
||||
mov X_2, X_0 // inner loop pSrc
|
||||
mov X_14, X_3 // store nDigits for later
|
||||
mov X_15, X_3 // store nDigits for later
|
||||
|
||||
//
|
||||
// First iteration of main loop (no adding of previous values from pDst)
|
||||
//
|
||||
ands X_12, X_12, xzr // Clearing the carry flag and setting X_12 = 0
|
||||
ldr X_6, [X_0] // load the first word from pSrc1
|
||||
str X_12, [X_4] // store 0 for the first word
|
||||
|
||||
b SymCryptFdefRawSquareAsmInnerLoopInit_Word1
|
||||
|
||||
LABEL(SymCryptFdefRawSquareAsmInnerLoopInit_Word0)
|
||||
SQR_SINGLEADD_64 0, X_2, X_4, X_6, X_12, X_12, X_7, X_8
|
||||
|
||||
LABEL(SymCryptFdefRawSquareAsmInnerLoopInit_Word1)
|
||||
SQR_SINGLEADD_64 1, X_2, X_4, X_6, X_12, X_12, X_7, X_8
|
||||
|
||||
SQR_SINGLEADD_64 2, X_2, X_4, X_6, X_12, X_12, X_7, X_8
|
||||
|
||||
SQR_SINGLEADD_64 3, X_2, X_4, X_6, X_12, X_12, X_7, X_8
|
||||
|
||||
sub X_3, X_3, #1 // move one digit up
|
||||
add X_2, X_2, #32
|
||||
add X_4, X_4, #32
|
||||
|
||||
cbnz X_3, SymCryptFdefRawSquareAsmInnerLoopInit_Word0
|
||||
|
||||
str X_12, [X_4] // Store the next word into the destination
|
||||
|
||||
sub X_1, X_1, #2 // move two words up (we started at the word 1)
|
||||
|
||||
mov X_8, #1 // Cyclic counter
|
||||
|
||||
//
|
||||
// MAIN LOOP
|
||||
//
|
||||
LABEL(SymCryptFdefRawSquareAsmOuterLoop)
|
||||
|
||||
add X_5, X_5, #8 // move start of pDst one word up
|
||||
|
||||
mov X_3, X_14 // set nDigits
|
||||
mov X_2, X_0 // set pSrc
|
||||
mov X_4, X_5 // set pDst
|
||||
|
||||
ands X_12, X_12, xzr // Clearing the carry flag and setting X_12 = 0
|
||||
ldr X_6, [X_0, X_8, LSL #3] // load the next word from pSrc
|
||||
|
||||
// Cyclic counter and jump logic
|
||||
add X_8, X_8, #1
|
||||
cmp X_8, #1
|
||||
beq SymCryptFdefRawSquareAsmInnerLoop_Word1
|
||||
cmp X_8, #2
|
||||
beq SymCryptFdefRawSquareAsmInnerLoop_Word2
|
||||
cmp X_8, #3
|
||||
beq SymCryptFdefRawSquareAsmInnerLoop_Word3
|
||||
|
||||
// The following instructions are only executed when X_8 == 4
|
||||
mov X_8, xzr // Set it to 0
|
||||
|
||||
add X_0, X_0, #32 // move start of pSrc 4 words up
|
||||
add X_5, X_5, #32 // move pDst 4 words up
|
||||
|
||||
mov X_2, X_0 // set pSrc
|
||||
mov X_4, X_5 // set pDst
|
||||
|
||||
sub X_14, X_14, #1 // remove 1 digit
|
||||
mov X_3, X_14 // set the new digit counter
|
||||
|
||||
LABEL(SymCryptFdefRawSquareAsmInnerLoop_Word0)
|
||||
SQR_DOUBLEADD_64 0, X_2, X_4, X_6, X_12, X_12, X_7, X_9, X_10
|
||||
|
||||
LABEL(SymCryptFdefRawSquareAsmInnerLoop_Word1)
|
||||
SQR_DOUBLEADD_64 1, X_2, X_4, X_6, X_12, X_12, X_7, X_9, X_10
|
||||
|
||||
LABEL(SymCryptFdefRawSquareAsmInnerLoop_Word2)
|
||||
SQR_DOUBLEADD_64 2, X_2, X_4, X_6, X_12, X_12, X_7, X_9, X_10
|
||||
|
||||
LABEL(SymCryptFdefRawSquareAsmInnerLoop_Word3)
|
||||
SQR_DOUBLEADD_64 3, X_2, X_4, X_6, X_12, X_12, X_7, X_9, X_10
|
||||
|
||||
sub X_3, X_3, #1 // move one digit up
|
||||
add X_2, X_2, #32
|
||||
add X_4, X_4, #32
|
||||
|
||||
cbnz X_3, SymCryptFdefRawSquareAsmInnerLoop_Word0
|
||||
|
||||
str X_12, [X_4] // Store the next word into the destination
|
||||
|
||||
sub X_1, X_1, #1 // move one word up
|
||||
cbnz X_1, SymCryptFdefRawSquareAsmOuterLoop
|
||||
|
||||
ands X_12, X_12, xzr // Setting X_12 = 0
|
||||
str X_12, [X_5, #40] // Store 0 to destination for the top word
|
||||
|
||||
////////////////////////////////////////////////////////////////
|
||||
// Second Pass - Shifting all results 1 bit left
|
||||
////////////////////////////////////////////////////////////////
|
||||
|
||||
mov X_3, X_15 // nDigits
|
||||
lsl X_3, X_3, #1 // Double digits
|
||||
mov X_4, X_16 // pDst pointer
|
||||
ands X_7, X_7, xzr // Clear the flags
|
||||
|
||||
LABEL(SymCryptFdefRawSquareAsmSecondPass)
|
||||
|
||||
sub X_3, X_3, #1 // move one digit up
|
||||
|
||||
ldp X_7, X_8, [X_4]
|
||||
adcs X_7, X_7, X_7 // Shift left and add the carry
|
||||
adcs X_8, X_8, X_8
|
||||
stp X_7, X_8, [X_4], #16
|
||||
|
||||
ldp X_9, X_10, [X_4]
|
||||
adcs X_9, X_9, X_9 // Shift left and add the carry
|
||||
adcs X_10, X_10, X_10
|
||||
stp X_9, X_10, [X_4], #16
|
||||
|
||||
cbnz X_3, SymCryptFdefRawSquareAsmSecondPass
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
// Third Pass - Adding the squares on the even columns and propagating the sum
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
ands X_7, X_7, xzr // Clear the flags
|
||||
mov X_0, X_13 // src pointer
|
||||
mov X_4, X_16 // pDst pointer
|
||||
mov X_3, X_15 // nDigits
|
||||
|
||||
LABEL(SymCryptFdefRawSquareAsmThirdPass)
|
||||
SQR_DIAGONAL_PROP 0, X_0, X_4, X_6, X_7, X_8, X_9
|
||||
SQR_DIAGONAL_PROP 1, X_0, X_4, X_6, X_7, X_8, X_9
|
||||
SQR_DIAGONAL_PROP 2, X_0, X_4, X_6, X_7, X_8, X_9
|
||||
SQR_DIAGONAL_PROP 3, X_0, X_4, X_6, X_7, X_8, X_9
|
||||
|
||||
sub X_3, X_3, #1 // move one digit up
|
||||
add X_0, X_0, #32 // One digit up (not updated in SQR_DIAGONAL_PROP)
|
||||
add X_4, X_4, #64 // Two digits up (not updated in SQR_DIAGONAL_PROP)
|
||||
|
||||
cbnz X_3, SymCryptFdefRawSquareAsmThirdPass
|
||||
|
||||
// Done, no return value
|
||||
|
||||
FUNCTION_END(ARM64EC_NAME_MANGLE(SymCryptFdefRawSquareAsm))
|
||||
|
||||
//VOID
|
||||
//SYMCRYPT_CALL
|
||||
//SymCryptFdefMontgomeryReduceAsm(
|
||||
// _In_ PCSYMCRYPT_MODULUS pmMod,
|
||||
// _Inout_ PUINT32 pSrc,
|
||||
// _Out_ PUINT32 pDst )
|
||||
//
|
||||
// Register assignments
|
||||
// X_0 = pMod (moving forward one *digit* every inner loop)
|
||||
// X_1 = pSrc (moving forward one *digit* every inner loop)
|
||||
// X_2 = pDst (used only in the end for subtract / result)
|
||||
// X_3 = digit count of pSrc and pMod
|
||||
// X_4 = word count of pSrc
|
||||
// X_5 = Inv64 of the modulus
|
||||
// X_6 = m = pSrc[i]*Inv64
|
||||
// X_7 = hc = high carry variable
|
||||
// X_8, X_9 = Current words loaded in pairs from pSrc
|
||||
// X_10, X_11 = Current words loaded in pairs from pMod
|
||||
// X_12, X_13 = c variable = "128-bit" register to hold the result of multiplies
|
||||
// It is flipped between [X_12:X_13] and [X_13:X_12] instead of doing c>>=64
|
||||
// X_14 = Temporary intermediate result
|
||||
// X_15 = Stored digit count of pSrc
|
||||
// X_16 = Stored pMod pointer
|
||||
// X_17 = Stored pSrc pointer (moving forward one word every outer loop)
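//
// Editorial sketch (not part of the original source): this is word-wise Montgomery reduction,
// roughly the following C, using a hypothetical UINT128 type and nWords = nDigits * 4:
//
//   hc = 0;
//   for( i = 0; i < nWords; i++ )
//   {
//       m = pSrc[i] * Inv64;        // mod 2^64
//       c = 0;
//       for( j = 0; j < nWords; j++ )
//       {
//           UINT128 t = (UINT128) m * pMod[j] + pSrc[i + j] + c;
//           pSrc[i + j] = (UINT64) t;
//           c = (UINT64)( t >> 64 );
//       }
//       // fold c and hc into pSrc[i + nWords], keeping the carry out in hc
//   }
//
// The reduced value ends up in the top nWords words of pSrc; the code below then subtracts
// the modulus once and uses a masked copy so that pDst receives the correct result without
// a data-dependent branch.
//
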
FUNCTION_START(ARM64EC_NAME_MANGLE(SymCryptFdefMontgomeryReduceAsm), 3, 18)
|
||||
|
||||
ldr W_3, [X_0, #SymCryptModulusNdigitsOffsetArm64] // # of Digits
|
||||
ldr X_5, [X_0, #SymCryptModulusMontgomeryInv64OffsetArm64] // Inv64 of modulus
|
||||
add X_0, X_0, #SymCryptModulusValueOffsetArm64 // pMod
|
||||
|
||||
lsl X_4, X_3, #2 // Multiply by 4 to get the number of words
|
||||
|
||||
sub X_0, X_0, #32 // offset pMod so we can use pre-increment form of loads
|
||||
sub X_1, X_1, #32 // offset pSrc so we can use pre-increment form of loads
|
||||
sub X_2, X_2, #32 // offset pDst so we can use pre-increment form of loads
|
||||
|
||||
mov X_15, X_3 // Store the digit count for later
|
||||
mov X_16, X_0 // Store the pMod pointer
|
||||
mov X_17, X_1 // Store the pSrc pointer
|
||||
|
||||
and X_7, X_7, xzr // Set hc to 0
|
||||
|
||||
//
|
||||
// Main loop
|
||||
//
|
||||
LABEL(SymCryptFdefMontgomeryReduceAsmOuter)
|
||||
ldr X_8, [X_1, #32] // Load 1 word from pSrc
|
||||
mul X_6, X_8, X_5 // <63:0> bits of pSrc[i]*Inv64 = m
|
||||
|
||||
and X_12, X_12, xzr // Set c to 0
|
||||
|
||||
LABEL(SymCryptFdefMontgomeryReduceAsmInner)
|
||||
ldp X_10, X_11, [X_0, #32]! // pMod[j]
|
||||
ldp X_8, X_9, [X_1, #32]! // pSrc[j]
|
||||
|
||||
mul X_14, X_6, X_10 // <63:0> of pMod[j]*m
|
||||
adds X_14, X_14, X_8 // Adding pSrc[j]
|
||||
umulh X_13, X_6, X_10 // <127:64> of pMod[j]*m
|
||||
adc X_13, X_13, xzr // Add the carry if any (***)
|
||||
adds X_12, X_12, X_14 // Add the lower bits of c
|
||||
adc X_13, X_13, xzr // Add the carry if any (***)
|
||||
// ***: These cannot produce extra carry as the maximum is
|
||||
// (2^64 - 1)*(2^64-1) + 2^64-1 + 2^64-1 = 2^128 - 1
|
||||
str X_12, [X_1] // pSrc[j] = (UINT64) c
|
||||
|
||||
mul X_14, X_6, X_11 // <63:0> of pMod[j]*m
|
||||
adds X_14, X_14, X_9 // Adding pSrc[j]
|
||||
umulh X_12, X_6, X_11 // <127:64> of pMod[j]*m
|
||||
adc X_12, X_12, xzr // Add the carry if any (***)
|
||||
adds X_13, X_13, X_14 // Add the lower bits of c
|
||||
adc X_12, X_12, xzr // Add the carry if any (***)
|
||||
str X_13, [X_1, #8] // pSrc[j] = (UINT64) c
|
||||
|
||||
ldp X_10, X_11, [X_0, #16] // pMod[j]
|
||||
ldp X_8, X_9, [X_1, #16] // pSrc[j]
|
||||
|
||||
mul X_14, X_6, X_10 // <63:0> of pMod[j]*m
|
||||
adds X_14, X_14, X_8 // Adding pSrc[j]
|
||||
umulh X_13, X_6, X_10 // <127:64> of pMod[j]*m
|
||||
adc X_13, X_13, xzr // Add the carry if any (***)
|
||||
adds X_12, X_12, X_14 // Add the lower bits of c
|
||||
adc X_13, X_13, xzr // Add the carry if any (***)
|
||||
str X_12, [X_1, #16] // pSrc[j] = (UINT64) c
|
||||
|
||||
mul X_14, X_6, X_11 // <63:0> of pMod[j]*m
|
||||
adds X_14, X_14, X_9 // Adding pSrc[j]
|
||||
umulh X_12, X_6, X_11 // <127:64> of pMod[j]*m
|
||||
adc X_12, X_12, xzr // Add the carry if any (***)
|
||||
adds X_13, X_13, X_14 // Add the lower bits of c
|
||||
adc X_12, X_12, xzr // Add the carry if any (***)
|
||||
str X_13, [X_1, #24] // pSrc[j] = (UINT64) c
|
||||
|
||||
subs X_3, X_3, #1 // Move one digit up
|
||||
bne SymCryptFdefMontgomeryReduceAsmInner
|
||||
|
||||
ldr X_8, [X_1, #32] // pSrc[nWords]
|
||||
adds X_12, X_12, X_8 // c + pSrc[nWords]
|
||||
adc X_13, xzr, xzr // Add the carry if any
|
||||
|
||||
adds X_12, X_12, X_7 // c + pSrc[nWords] + hc
|
||||
adc X_7, X_13, xzr // Add the carry if any and store into hc
|
||||
|
||||
str X_12, [X_1, #32] // pSrc[nWords] = c
|
||||
|
||||
subs X_4, X_4, #1 // Move one word up
|
||||
|
||||
add X_17, X_17, #8 // Move stored pSrc pointer one word up
|
||||
mov X_0, X_16 // Restore pMod pointer
|
||||
mov X_1, X_17 // Restore pSrc pointer
|
||||
|
||||
mov X_3, X_15 // Restore the digit counter
|
||||
|
||||
bne SymCryptFdefMontgomeryReduceAsmOuter
|
||||
|
||||
//
|
||||
// Subtraction
|
||||
//
|
||||
|
||||
mov X_14, X_2 // Store pDst pointer
|
||||
|
||||
// Prepare the pointers for subtract
|
||||
mov X_0, X_17 // pSrc
|
||||
mov X_1, X_16 // pMod
|
||||
|
||||
mov X_10, X_7 // X_10 = hc
|
||||
mov X_3, X_15 // Restore the digit counter
|
||||
subs X_4, X_4, X_4 // Set the carry flag (i.e. no borrow)
|
||||
|
||||
LABEL(SymCryptFdefMontgomeryReduceRawSubAsmLoop)
|
||||
sub X_3, X_3, #1 // Decrement the digit count by one
|
||||
// borrow is in the carry flag (flipped)
|
||||
|
||||
ldp X_4, X_6, [X_0, #32]! // Load two words of pSrc1
|
||||
ldp X_5, X_7, [X_1, #32]! // Load two words of pSrc2
|
||||
sbcs X_4, X_4, X_5
|
||||
sbcs X_6, X_6, X_7
|
||||
stp X_4, X_6, [X_2, #32]! // Store the result in the destination
|
||||
|
||||
ldp X_4, X_6, [X_0, #16] // Load two words of pSrc1
|
||||
ldp X_5, X_7, [X_1, #16] // Load two words of pSrc2
|
||||
sbcs X_4, X_4, X_5
|
||||
sbcs X_6, X_6, X_7
|
||||
stp X_4, X_6, [X_2, #16] // Store the result in the destination
|
||||
|
||||
cbnz X_3, SymCryptFdefMontgomeryReduceRawSubAsmLoop
|
||||
|
||||
cset X_0, cc // If the carry is clear (borrow), set the return value to 1
|
||||
|
||||
orr X_11, X_10, X_0 // X_11 = hc|d
|
||||
|
||||
// Prepare the pointers for masked copy
|
||||
mov X_0, X_17 // pSrc
|
||||
mov X_1, X_14 // pDst
|
||||
|
||||
mov X_2, X_15 // Restore the digit counter
|
||||
subs X_4, X_10, X_11 // If (X_11 > X_10) clear the carry flag (i.e. borrow)
|
||||
|
||||
LABEL(SymCryptFdefMontgomeryReduceMaskedCopyAsmLoop)
|
||||
sub X_2, X_2, #1 // decrement the digit count by one
|
||||
|
||||
ldp X_4, X_6, [X_0, #32]! // Load two words of the source
|
||||
ldp X_5, X_7, [X_1, #32]! // Load two words of the destination
|
||||
csel X_4, X_4, X_5, cc // If the carry is clear, select the source operands
|
||||
csel X_6, X_6, X_7, cc
|
||||
stp X_4, X_6, [X_1] // Store the two words in the destination
|
||||
|
||||
ldp X_4, X_6, [X_0, #16]
|
||||
ldp X_5, X_7, [X_1, #16]
|
||||
csel X_4, X_4, X_5, cc
|
||||
csel X_6, X_6, X_7, cc
|
||||
stp X_4, X_6, [X_1, #16]
|
||||
|
||||
cbnz X_2, SymCryptFdefMontgomeryReduceMaskedCopyAsmLoop
|
||||
|
||||
// Done, no return value
|
||||
|
||||
FUNCTION_END(ARM64EC_NAME_MANGLE(SymCryptFdefMontgomeryReduceAsm))
|
||||
|
||||
FILE_END()
|
|
@ -1,28 +0,0 @@
;
; SymCrypt_magic.inc
; Copyright (c) Microsoft Corporation. Licensed under the MIT license.
;
; Include file to define the support macros for the Magic field
;

IMPORT ARM64EC_NAME_MANGLE(SymCryptFatal)

#define SYMCRYPT_CODE_VERSION (SYMCRYPT_CODE_VERSION_API * 65536 + SYMCRYPT_CODE_VERSION_MINOR)
#define SYMCRYPT_MAGIC_CONSTANT (0x53316d76 + SYMCRYPT_CODE_VERSION)

MACRO
SYMCRYPT_CHECK_MAGIC $temp1, $temp2, $ptr, $offset

#if SYMCRYPT_DEBUG

ldr $temp1, [$ptr, #$offset]
subs $temp1, $temp1, $ptr
mov32 $temp2, SYMCRYPT_MAGIC_CONSTANT
cmp $temp1, $temp2
beq %F1
mov32 r0, 0x6d616763 ; 'magc'
bl ARM64EC_NAME_MANGLE(SymCryptFatal)
1
#endif

MEND
@ -1,37 +0,0 @@
TTL "SymCryptWipe"
;++
;
; Copyright (c) Microsoft Corporation. Licensed under the MIT license.
;
; Secure wipe
;
;--

#include "ksarm64.h"
#include "symcrypt_name_mangling.inc"

TEXTAREA

EXTERN ARM64EC_NAME_MANGLE(memset)

SUBT "SymCryptWipe"
;VOID
;SYMCRYPT_CALL
;SymCryptWipe( _Out_writes_bytes_( cbData ) PVOID pbData,
; SIZE_T cbData )


LEAF_ENTRY ARM64EC_NAME_MANGLE(SymCryptWipeAsm)

; we just jump to memset.
; this is enough to stop the compiler optimizing the memset away.

mov x2, x1
mov x1, #0
b ARM64EC_NAME_MANGLE(memset)

LEAF_END ARM64EC_NAME_MANGLE(SymCryptWipeAsm)


END
@ -0,0 +1,31 @@
//
// wipe.symcryptasm Assembler code for wiping a buffer
// Expresses asm in a generic enough way to enable generation of MASM and GAS using the
// symcryptasm_processor.py script and C preprocessor
//
// Copyright (c) Microsoft Corporation. Licensed under the MIT license.


#include "symcryptasm_shared.cppasm"

TEXTAREA()

EXTERN(ARM64EC_NAME_MANGLE(memset))

//VOID
//SYMCRYPT_CALL
//SymCryptWipe( _Out_writes_bytes_( cbData ) PVOID pbData,
// SIZE_T cbData )

FUNCTION_START(ARM64EC_NAME_MANGLE(SymCryptWipeAsm), 2, 3)

// we just jump to memset.
// this is enough to stop the compiler optimizing the memset away.

mov X_2, X_1
mov X_1, #0
b ARM64EC_NAME_MANGLE(memset)

FUNCTION_END(ARM64EC_NAME_MANGLE(SymCryptWipeAsm))

FILE_END()
@ -1 +0,0 @@
#include "..\arm64\fdef369_asm.asm"
@ -1 +0,0 @@
#include "..\arm64\fdef_asm.asm"
@ -1 +0,0 @@
#include "..\arm64\symcrypt_magic.inc"
@ -1 +0,0 @@
#include "..\arm64\symcrypt_name_mangling.inc"
@ -1 +0,0 @@
#include "..\arm64\wipe.asm"
@ -1,132 +0,0 @@
|
|||
//
|
||||
// asmstubs.c
|
||||
// Temporary forwarders for ASM implementations which we don't yet support with GCC/LLVM on Arm64
|
||||
//
|
||||
// Copyright (c) Microsoft Corporation. Licensed under the MIT license.
|
||||
//
|
||||
|
||||
#include "../precomp.h"
|
||||
|
||||
VOID
|
||||
SYMCRYPT_CALL
|
||||
SymCryptWipeAsm( _Out_writes_bytes_( cbData ) PVOID pbData, SIZE_T cbData )
|
||||
{
|
||||
volatile BYTE * p = (volatile BYTE *) pbData;
|
||||
SIZE_T i;
|
||||
|
||||
for( i=0; i<cbData; i++ ){
|
||||
p[i] = 0;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
VOID
|
||||
SYMCRYPT_CALL
|
||||
SymCryptFdefMaskedCopyC(
|
||||
_In_reads_bytes_( nDigits*SYMCRYPT_FDEF_DIGIT_SIZE ) PCBYTE pbSrc,
|
||||
_Inout_updates_bytes_( nDigits*SYMCRYPT_FDEF_DIGIT_SIZE ) PBYTE pbDst,
|
||||
UINT32 nDigits,
|
||||
UINT32 mask );
|
||||
|
||||
VOID
|
||||
SYMCRYPT_CALL
|
||||
SymCryptFdefMaskedCopyAsm(
|
||||
_In_reads_bytes_( nDigits*SYMCRYPT_FDEF_DIGIT_SIZE ) PCBYTE pbSrc,
|
||||
_Inout_updates_bytes_( nDigits*SYMCRYPT_FDEF_DIGIT_SIZE ) PBYTE pbDst,
|
||||
UINT32 nDigits,
|
||||
UINT32 mask )
|
||||
{
|
||||
SymCryptFdefMaskedCopyC( pbSrc, pbDst, nDigits, mask );
|
||||
}
|
||||
|
||||
UINT32
|
||||
SYMCRYPT_CALL
|
||||
SymCryptFdefRawAddC(
|
||||
_In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc1,
|
||||
_In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc2,
|
||||
_Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 pDst,
|
||||
UINT32 nDigits );
|
||||
|
||||
UINT32
|
||||
SYMCRYPT_CALL
|
||||
SymCryptFdefRawAddAsm(
|
||||
_In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc1,
|
||||
_In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc2,
|
||||
_Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 pDst,
|
||||
UINT32 nDigits )
|
||||
{
|
||||
return SymCryptFdefRawAddC( pSrc1, pSrc2, pDst, nDigits );
|
||||
}
|
||||
|
||||
UINT32
|
||||
SYMCRYPT_CALL
|
||||
SymCryptFdefRawSubC(
|
||||
_In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc1,
|
||||
_In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc2,
|
||||
_Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 pDst,
|
||||
UINT32 nDigits );
|
||||
|
||||
UINT32
|
||||
SYMCRYPT_CALL
|
||||
SymCryptFdefRawSubAsm(
|
||||
_In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc1,
|
||||
_In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc2,
|
||||
_Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 pDst,
|
||||
UINT32 nDigits )
|
||||
{
|
||||
return SymCryptFdefRawSubC( pSrc1, pSrc2, pDst, nDigits );
|
||||
}
|
||||
|
||||
VOID
|
||||
SYMCRYPT_CALL
|
||||
SymCryptFdefRawMulC(
|
||||
_In_reads_(nWords1) PCUINT32 pSrc1,
|
||||
UINT32 nDigits1,
|
||||
_In_reads_(nWords2) PCUINT32 pSrc2,
|
||||
UINT32 nDigits2,
|
||||
_Out_writes_(nWords1 + nWords2) PUINT32 pDst );
|
||||
|
||||
VOID
|
||||
SYMCRYPT_CALL
|
||||
SymCryptFdefRawMulAsm(
|
||||
_In_reads_(nDigits1*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc1,
|
||||
UINT32 nDigits1,
|
||||
_In_reads_(nDigits2*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc2,
|
||||
UINT32 nDigits2,
|
||||
_Out_writes_(nWords1 + nWords2) PUINT32 pDst )
|
||||
{
|
||||
SymCryptFdefRawMulC( pSrc1, nDigits1, pSrc2, nDigits2, pDst );
|
||||
}
|
||||
|
||||
VOID
|
||||
SYMCRYPT_CALL
|
||||
SymCryptFdefRawSquareC(
|
||||
_In_reads_(nWords) PCUINT32 pSrc,
|
||||
UINT32 nDigits,
|
||||
_Out_writes_(2*nWords) PUINT32 pDst );
|
||||
|
||||
VOID
|
||||
SYMCRYPT_CALL
|
||||
SymCryptFdefRawSquareAsm(
|
||||
_In_reads_(nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc,
|
||||
UINT32 nDigits,
|
||||
_Out_writes_(2*nWords) PUINT32 pDst )
|
||||
{
|
||||
SymCryptFdefRawSquareC( pSrc, nDigits, pDst );
|
||||
}
|
||||
|
||||
VOID
|
||||
SymCryptFdefMontgomeryReduceC(
|
||||
_In_ PCSYMCRYPT_MODULUS pmMod,
|
||||
_In_ PUINT32 pSrc,
|
||||
_Out_ PUINT32 pDst );
|
||||
|
||||
VOID
|
||||
SYMCRYPT_CALL
|
||||
SymCryptFdefMontgomeryReduceAsm(
|
||||
_In_ PCSYMCRYPT_MODULUS pmMod,
|
||||
_In_ PUINT32 pSrc,
|
||||
_Out_ PUINT32 pDst )
|
||||
{
|
||||
SymCryptFdefMontgomeryReduceC( pmMod, pSrc, pDst );
|
||||
}
|
@ -4,9 +4,18 @@

# Preprocess amd64 .symcryptasm into masm
{amd64\}.symcryptasm{$(OBJ_PATH)\$(O)\..\amd64\}.asm:
    ..\scripts\symcryptasm_processor.py masm $< $(OBJ_PATH)\$(O)\$(<B).cppasm
    ..\scripts\symcryptasm_processor.py masm amd64 msft $< $(OBJ_PATH)\$(O)\$(<B).cppasm
    $(CC) $(CFLAGS) /EP /P /I..\inc\ /I.\ /DSYMCRYPT_CPU_AMD64 /DSYMCRYPT_MASM /Fi$@ $(OBJ_PATH)\$(O)\$(<B).cppasm

# Preprocess x86 .cppasm into masm
{i386\}.cppasm{$(OBJ_PATH)\$(O)\..\i386\}.asm:
    $(CC) $(CFLAGS) /EP /P /I..\inc\ /I.\ /DSYMCRYPT_CPU_X86 /DSYMCRYPT_MASM /Fi$@ $<


# Preprocess arm64 .symcryptasm into masm
{arm64\}.symcryptasm{$(OBJ_PATH)\$(O)\..\arm64\}.asm:
    ..\scripts\symcryptasm_processor.py armasm64 arm64 aapcs64 $< $(OBJ_PATH)\$(O)\$(<B).asm

# Preprocess arm64ec .symcryptasm into masm
{arm64\}.symcryptasm{$(OBJ_PATH)\$(O)\..\arm64\arm64ec\}.asm:
    ..\scripts\symcryptasm_processor.py armasm64 arm64 arm64ec $< $(OBJ_PATH)\$(O)\arm64ec\$(<B).asm

lib/sources
@ -18,8 +18,13 @@ GUARD = 1 # enable CFG
ENABLE_ASM_RETPOLINE = 1
ENABLE_RETPOLINE_LINKER_WARNING = 1

# Enable /Gy for all assembler code
ASM_DEFINES=$(ASM_DEFINES) /Gy
# Enable /Gy for all assembler code, and some additional symcryptasm definitions for Arm64 assembler code
ASM_DEFINES=\
!IF "$(_BUILDARCH)" == "arm64"
    $(ASM_DEFINES) /Gy /DSYMCRYPT_CPU_ARM64 /DSYMCRYPT_MASM
!ELSE
    $(ASM_DEFINES) /Gy
!ENDIF

USE_MAKEFILE_INC = 1

@ -35,6 +40,15 @@ NTTARGETFILE0=\
!ELSEIF "$(_BUILDARCH)" == "x86"
    $(OBJ_PATH)\$(O)\..\i386\aesasm.asm \
    $(OBJ_PATH)\$(O)\..\i386\fdef_asm.asm \
!ELSEIF "$(_BUILDARCH)" == "arm64"
    $(OBJ_PATH)\$(O)\..\arm64\fdef_asm.asm \
    $(OBJ_PATH)\$(O)\..\arm64\fdef369_asm.asm \
    $(OBJ_PATH)\$(O)\..\arm64\wipe.asm \
!IF "$(ARM64X_EC_ENABLED)" == "1"
    $(OBJ_PATH)\$(O)\..\arm64\arm64ec\fdef_asm.asm \
    $(OBJ_PATH)\$(O)\..\arm64\arm64ec\fdef369_asm.asm \
    $(OBJ_PATH)\$(O)\..\arm64\arm64ec\wipe.asm \
!ENDIF
!ENDIF

INCLUDES= \
@ -9,6 +9,10 @@

#if defined(SYMCRYPT_CPU_AMD64)
include ksamd64.inc
#elif defined(SYMCRYPT_CPU_ARM64)
#include "ksarm64.h"
#include "arm64/symcrypt_name_mangling.inc"
#include "symcrypt_version.inc"
#endif

#include "C_asm_shared.inc"
@ -17,17 +21,29 @@ include ksamd64.inc
#define ALIGN(__alignment) align __alignment
#define GET_SYMBOL_ADDRESS(__symbol) __symbol
#define HEX(__constant) __constant##h
#define TEXTAREA() TEXTAREA
#define EXTERN(__label) EXTERN __label
#define LABEL(__labelname) __labelname

#elif defined(SYMCRYPT_GAS)

#if defined(SYMCRYPT_CPU_AMD64)
.intel_syntax noprefix
#elif defined(SYMCRYPT_CPU_ARM64)
#include "arm64/symcrypt_name_mangling.inc"
#include "symcrypt_version.inc"
#endif

#include "C_asm_shared.inc"


#define FILE_END()
#define ALIGN(__alignment) .align __alignment
#define GET_SYMBOL_ADDRESS(__symbol) __symbol+rip
#define HEX(__constant) 0x##__constant
#define TEXTAREA()
#define EXTERN(__label)
#define LABEL(__labelname) __labelname:

#else

@ -5,20 +5,26 @@ environments without requiring forking or duplication of source files - symcrypt
assembly in an assembler and environment agnostic way.

The current target assemblers are:
    MASM and GAS
    MASM, GAS, and armasm64 (the Arm64 assembler which ships with MSVC)
The current target environments are:
    amd64 Windows (using the Microsoft x64 calling convention), and
    amd64 Linux (using the SystemV amd64 calling convention)
    amd64 Windows (using the Microsoft x64 calling convention),
    amd64 Linux (using the SystemV amd64 calling convention),
    arm64 Windows (using the aapcs64 calling convention),
    arm64 Windows (using the arm64ec calling convention), and
    arm64 Linux (using the aapcs64 calling convention)

Currently we only support functions with up to 6 arguments, and only amd64, but the plan is to
rephrase all remaining .asm in SymCrypt as symcryptasm, extending support as appropriate to enable
this effort.

The processing of symcryptasm files takes place in 2 passes. The first pass is performed by this
symcryptasm_processor.py script, which does the more stateful processing, outputting a .cppasm file.
The plan is to rephrase all remaining .asm in SymCrypt as symcryptasm, extending support as
appropriate to enable this effort.

Normally the processing of symcryptasm files takes place in 2 passes. The first pass is performed by
this symcryptasm_processor.py script, which does the more stateful processing, outputting a .cppasm
file.
The .cppasm files are further processed by the C preprocessor to do simpler, stateless text
substitutions, outputting a .asm file which can be assembled by the target assembler for the target
environment.
The exception is the armasm64 assembler, which already runs the C preprocessor on its inputs before
assembling them; in that case the output of this script is assembled directly by armasm64.

We have set up the intermediate generated files to be created in the output directories in both
razzle and CMake builds.
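To make the two-pass flow described above concrete, here is a minimal sketch of driving both passes by hand for an amd64 Linux (gas) target. It is illustrative only: the file names, the include path, and the choice of clang as the C preprocessor are assumptions, not part of the build system.

# Illustrative sketch only: mirrors the 2-pass flow described above.
# Assumptions: symcryptasm_processor.py is on disk, clang is available as the
# C preprocessor, and file names / include path are hypothetical.
import subprocess

src    = "fdef_asm.symcryptasm"   # hypothetical symcryptasm input
cppasm = "fdef_asm.gen.cppasm"    # pass 1 output (stateful processing)
asm    = "fdef_asm.gen.asm"       # pass 2 output (stateless text substitution)

# Pass 1: this script (assembler, architecture, calling convention, input, output)
subprocess.run(["python3", "symcryptasm_processor.py",
                "gas", "amd64", "systemv", src, cppasm], check=True)

# Pass 2: the C preprocessor turns the .cppasm into the final .asm for the target
subprocess.run(["clang", "-E", "-P", "-x", "c", "-Iinc",
                "-DSYMCRYPT_CPU_AMD64", "-DSYMCRYPT_GAS",
                "-o", asm, cppasm], check=True)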
@ -42,6 +48,7 @@ FUNCTION_START macro which currently takes 3 arguments:
    These arguments will be accessible in some contiguous region of the symcrypt registers at the
    start of the function
        On amd64 this contiguous region is R1..R<arg_count>
        On arm64 this contiguous region is R0..R<arg_count-1>
    Note: arg_count need not correspond to the exact number of arguments in the function declaration
    if the assembly does not use some tail of the arguments
3) The number of registers (reg_count) that the function uses
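To make the shape of these directives concrete, the following self-contained sketch parses a FUNCTION_START line the same way the script does; the regular expression mirrors FUNCTION_START_PATTERN defined further down, and the function name is purely illustrative.

# Illustrative: parsing a FUNCTION_START directive (hypothetical function name).
import re

FUNCTION_START_PATTERN = re.compile(
    r"\s*(NESTED_)?(MUL_)?FUNCTION_START\s*\(\s*([a-zA-Z0-9_\(\)]+)\s*,\s*([0-9]+)\s*,\s*([0-9]+)\s*\)")

line = "FUNCTION_START(SymCryptExampleRawAdd, 4, 6)"
match = FUNCTION_START_PATTERN.match(line)

function_name = match.group(3)        # "SymCryptExampleRawAdd"
arg_count     = int(match.group(4))   # 4 arguments -> R1..R4 on amd64, R0..R3 on arm64
reg_count     = int(match.group(5))   # 6 symcrypt registers used in total
print(function_name, arg_count, reg_count)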
@ -58,6 +65,7 @@ At the function end an epilogue is generated with restores the non-volatile regi
A nested function (a function which does call another function) is specified similarly, only using
NESTED_FUNCTION_START and NESTED_FUNCTION_END macros. A nested function currently updates and aligns
the stack pointer in the function prologue, and avoids use of the redzone in the SystemV ABI.
Nested functions are not currently supported for Arm64.


A macro begins with an invocation of the MACRO_START macro which takes the Macro name, and variable
@ -82,6 +90,15 @@ and QH. As rdx is used to pass arguments, its value is moved to another register
prologue. The MUL_FUNCTION_START and MUL_FUNCTION_END macros are used in this case.
We currently do not support nested mul functions, as we have none of them.

### arm64 ###
We allow up to 23 registers to be addressed, with the names:
    X_0-X_22 (64-bit registers) and W_0-W_22 (32-bit registers)
v0-v7 ASIMD registers may be used directly in assembly too, as in both arm64 calling conventions we
currently support, these registers are volatile so do not need any special handling

X_0 is always the result register and the first argument passed to the function.
X_1-X_7 are the arguments 2-8 passed to the function

"""

import re
@ -91,37 +108,71 @@ import logging
class Register:
    """A class to represent registers"""

    def __init__(self, name64, name32, name16, name8):
    def __init__(self, name64, name32, name16=None, name8=None):
        self.name64 = name64
        self.name32 = name32
        self.name16 = name16
        self.name8 = name8

# amd64 registers
REG_RAX = Register("rax", "eax", "ax", "al")
REG_RBX = Register("rbx", "ebx", "bx", "bl")
REG_RCX = Register("rcx", "ecx", "cx", "cl")
REG_RDX = Register("rdx", "edx", "dx", "dl")
REG_RSI = Register("rsi", "esi", "si", "sil")
REG_RDI = Register("rdi", "edi", "di", "dil")
REG_RSP = Register("rsp", "esp", "sp", "spl")
REG_RBP = Register("rbp", "ebp", "bp", "bpl")
REG_R8  = Register( "r8", "r8d", "r8w", "r8b")
REG_R9  = Register( "r9", "r9d", "r9w", "r9b")
REG_R10 = Register("r10", "r10d", "r10w", "r10b")
REG_R11 = Register("r11", "r11d", "r11w", "r11b")
REG_R12 = Register("r12", "r12d", "r12w", "r12b")
REG_R13 = Register("r13", "r13d", "r13w", "r13b")
REG_R14 = Register("r14", "r14d", "r14w", "r14b")
REG_R15 = Register("r15", "r15d", "r15w", "r15b")
AMD64_RAX = Register("rax", "eax", "ax", "al")
AMD64_RBX = Register("rbx", "ebx", "bx", "bl")
AMD64_RCX = Register("rcx", "ecx", "cx", "cl")
AMD64_RDX = Register("rdx", "edx", "dx", "dl")
AMD64_RSI = Register("rsi", "esi", "si", "sil")
AMD64_RDI = Register("rdi", "edi", "di", "dil")
AMD64_RSP = Register("rsp", "esp", "sp", "spl")
AMD64_RBP = Register("rbp", "ebp", "bp", "bpl")
AMD64_R8  = Register( "r8", "r8d", "r8w", "r8b")
AMD64_R9  = Register( "r9", "r9d", "r9w", "r9b")
AMD64_R10 = Register("r10", "r10d", "r10w", "r10b")
AMD64_R11 = Register("r11", "r11d", "r11w", "r11b")
AMD64_R12 = Register("r12", "r12d", "r12w", "r12b")
AMD64_R13 = Register("r13", "r13d", "r13w", "r13b")
AMD64_R14 = Register("r14", "r14d", "r14w", "r14b")
AMD64_R15 = Register("r15", "r15d", "r15w", "r15b")

# arm64 registers
ARM64_R0  = Register( "x0",  "w0")
ARM64_R1  = Register( "x1",  "w1")
ARM64_R2  = Register( "x2",  "w2")
ARM64_R3  = Register( "x3",  "w3")
ARM64_R4  = Register( "x4",  "w4")
ARM64_R5  = Register( "x5",  "w5")
ARM64_R6  = Register( "x6",  "w6")
ARM64_R7  = Register( "x7",  "w7")
ARM64_R8  = Register( "x8",  "w8")
ARM64_R9  = Register( "x9",  "w9")
ARM64_R10 = Register("x10", "w10")
ARM64_R11 = Register("x11", "w11")
ARM64_R12 = Register("x12", "w12")
ARM64_R13 = Register("x13", "w13")
ARM64_R14 = Register("x14", "w14")
ARM64_R15 = Register("x15", "w15")
ARM64_R16 = Register("x16", "w16")
ARM64_R17 = Register("x17", "w17")
ARM64_R18 = Register("x18", "w18")
ARM64_R19 = Register("x19", "w19")
ARM64_R20 = Register("x20", "w20")
ARM64_R21 = Register("x21", "w21")
ARM64_R22 = Register("x22", "w22")
ARM64_R23 = Register("x23", "w23")
ARM64_R24 = Register("x24", "w24")
ARM64_R25 = Register("x25", "w25")
ARM64_R26 = Register("x26", "w26")
ARM64_R27 = Register("x27", "w27")
ARM64_R28 = Register("x28", "w28")
ARM64_R29 = Register("x29", "w29") # Frame Pointer
ARM64_R30 = Register("x30", "w30") # Link Register

class CallingConvention:
    """A class to represent calling conventions"""

    def __init__(self, name, architecture, mapping, argument_registers, volatile_registers, gen_prologue_fn, gen_epilogue_fn, gen_get_memslot_offset_fn):
    def __init__(self, name, architecture, mapping, max_arguments, argument_registers, volatile_registers, gen_prologue_fn, gen_epilogue_fn, gen_get_memslot_offset_fn):
        self.name = name
        self.architecture = architecture
        self.mapping = mapping
        self.max_arguments = max_arguments
        self.argument_registers = argument_registers
        self.volatile_registers = volatile_registers
        self.gen_prologue_fn = types.MethodType(gen_prologue_fn, self)
@ -139,9 +190,9 @@ def get_mul_mapping_from_normal_mapping(mapping, argument_registers):
    we refer to rdx using (Q|D|W|B)H.
    """
    rdx_index = None
    return_mapping = { 'H': REG_RDX }
    return_mapping = { 'H': AMD64_RDX }
    for (index, register) in mapping.items():
        if register == REG_RDX:
        if register == AMD64_RDX:
            rdx_index = index
            break
    for (index, register) in mapping.items():
@ -156,28 +207,23 @@ def get_mul_mapping_from_normal_mapping(mapping, argument_registers):
            return_mapping[index-1] = register
    return return_mapping

# Calling convention constants

MAX_FUNCTION_ARGUMENT_COUNT = 6 # restrict to 6 arguments for now
MAX_FUNCTION_REGISTER_COUNT = 15

# Microsoft x64 calling convention
MAPPING_AMD64_MSFT = {
    0: REG_RAX, # Result register
    1: REG_RCX, # Argument 1 / volatile
    2: REG_RDX, # Argument 2 / volatile
    3: REG_R8,  # Argument 3 / volatile
    4: REG_R9,  # Argument 4 / volatile
    5: REG_R10, # volatile
    6: REG_R11, # volatile
    7: REG_RSI, # All registers from rsi are non-volatile and need to be saved/restored in epi/prologue
    8: REG_RDI,
    9: REG_RBP,
    10:REG_RBX,
    11:REG_R12,
    12:REG_R13,
    13:REG_R14,
    14:REG_R15,
    0: AMD64_RAX, # Result register / volatile
    1: AMD64_RCX, # Argument 1 / volatile
    2: AMD64_RDX, # Argument 2 / volatile
    3: AMD64_R8,  # Argument 3 / volatile
    4: AMD64_R9,  # Argument 4 / volatile
    5: AMD64_R10, # volatile
    6: AMD64_R11, # volatile
    7: AMD64_RSI, # All registers from rsi are non-volatile and need to be saved/restored in epi/prologue
    8: AMD64_RDI,
    9: AMD64_RBP,
    10:AMD64_RBX,
    11:AMD64_R12,
    12:AMD64_R13,
    13:AMD64_R14,
    14:AMD64_R15,
    # currently not mapping rsp
}

@ -212,11 +258,11 @@ def gen_prologue_amd64_msft(self, arg_count, reg_count, mul_fixup="", nested=Fal

    prologue += mul_fixup

    # put additional arguments into Q5-Q6 (we do not support more than 6 (MAX_FUNCTION_ARGUMENT_COUNT) arguments for now)
    # put additional arguments into Q5-Q6 (we do not support more than 6 arguments for now)
    # stack_offset to get the 5th argument is:
    # 32B of shadow space + 8B for return address + (8*#pushed registers in prologue) + shadow_space_allocation_size
    stack_offset = 32 + 8 + (8*(reg_count-self.volatile_registers)) + shadow_space_allocation_size
    for i in range(self.argument_registers+1, min(arg_count+1, MAX_FUNCTION_ARGUMENT_COUNT+1)):
    for i in range(self.argument_registers+1, arg_count+1):
        prologue += "mov Q%s, [rsp + %d]\n" % (i, stack_offset)
        stack_offset += 8
    return prologue
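As a worked example of the offset arithmetic above, with illustrative numbers not taken from any particular function:

# Worked example (illustrative values) of the Microsoft x64 stack offset used above.
volatile_registers           = 7    # this calling convention (see constants below)
reg_count                    = 10   # hypothetical function using 3 non-volatile registers
shadow_space_allocation_size = 0    # leaf function in this sketch

pushed = reg_count - volatile_registers                  # 3 registers pushed in the prologue
stack_offset = 32 + 8 + 8 * pushed + shadow_space_allocation_size
print(stack_offset)   # 64 -> argument 5 is loaded from [rsp + 64], argument 6 from [rsp + 72]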
@ -247,7 +293,7 @@ def gen_epilogue_amd64_msft_nested(self, arg_count, reg_count):
def gen_get_memslot_offset_amd64_msft(self, slot, arg_count, reg_count, nested=False):
    # only support 4 memory slots for now (in shadow space)
    if(slot >= 4):
        logging.error("Symcryptasm currently only support 4 memory slots! (requested slot%d)" % slot)
        logging.error("symcryptasm currently only support 4 memory slots! (requested slot%d)" % slot)
        exit(1)
    # 8B for return address + (8*#pushed registers in prologue)
    stack_offset = 8 + (8*(reg_count-self.volatile_registers))
@ -259,32 +305,32 @@ def gen_get_memslot_offset_amd64_msft_nested(self, slot, arg_count, reg_count):
    return gen_get_memslot_offset_amd64_msft(self, slot, arg_count, reg_count, nested=True)

CALLING_CONVENTION_AMD64_MSFT = CallingConvention(
    "msft_x64", "amd64", MAPPING_AMD64_MSFT, 4, 7,
    "msft_x64", "amd64", MAPPING_AMD64_MSFT, 6, 4, 7,
    gen_prologue_amd64_msft, gen_epilogue_amd64_msft, gen_get_memslot_offset_amd64_msft)
CALLING_CONVENTION_AMD64_MSFT_MUL = CallingConvention(
    "msft_x64", "amd64", get_mul_mapping_from_normal_mapping(MAPPING_AMD64_MSFT, 4), 4, 6,
    "msft_x64", "amd64", get_mul_mapping_from_normal_mapping(MAPPING_AMD64_MSFT, 4), 6, 4, 6,
    gen_prologue_amd64_msft_mul, gen_epilogue_amd64_msft, gen_get_memslot_offset_amd64_msft)
CALLING_CONVENTION_AMD64_MSFT_NESTED = CallingConvention(
    "msft_x64", "amd64", MAPPING_AMD64_MSFT, 4, 7,
    "msft_x64", "amd64", MAPPING_AMD64_MSFT, 6, 4, 7,
    gen_prologue_amd64_msft_nested, gen_epilogue_amd64_msft_nested, gen_get_memslot_offset_amd64_msft_nested)

# AMD64 System V calling convention
MAPPING_AMD64_SYSTEMV = {
    0: REG_RAX, # Result register
    1: REG_RDI, # Argument 1 / volatile
    2: REG_RSI, # Argument 2 / volatile
    3: REG_RDX, # Argument 3 / volatile
    4: REG_RCX, # Argument 4 / volatile
    5: REG_R8,  # Argument 5 / volatile
    6: REG_R9,  # Argument 6 / volatile
    7: REG_R10, # volatile
    8: REG_R11, # volatile
    9: REG_RBX, # All registers from rbx are non-volatile and need to be saved/restored in epi/prologue
    10:REG_RBP,
    11:REG_R12,
    12:REG_R13,
    13:REG_R14,
    14:REG_R15
    0: AMD64_RAX, # Result register / volatile
    1: AMD64_RDI, # Argument 1 / volatile
    2: AMD64_RSI, # Argument 2 / volatile
    3: AMD64_RDX, # Argument 3 / volatile
    4: AMD64_RCX, # Argument 4 / volatile
    5: AMD64_R8,  # Argument 5 / volatile
    6: AMD64_R9,  # Argument 6 / volatile
    7: AMD64_R10, # volatile
    8: AMD64_R11, # volatile
    9: AMD64_RBX, # All registers from rbx are non-volatile and need to be saved/restored in epi/prologue
    10:AMD64_RBP,
    11:AMD64_R12,
    12:AMD64_R13,
    13:AMD64_R14,
    14:AMD64_R15
    # currently not mapping rsp
}

@ -305,7 +351,7 @@ def gen_prologue_amd64_systemv(self, arg_count, reg_count, mul_fixup="", nested=

    prologue += mul_fixup

    # do not support more than 6 (MAX_FUNCTION_ARGUMENT_COUNT) arguments for now
    # do not support more than 6 arguments for now
    # # put additional arguments into Q7-Qn
    # # stack_offset to get the 7th argument is:
    # # 8B for return address
@ -341,7 +387,7 @@ def gen_epilogue_amd64_systemv_nested(self, arg_count, reg_count):
def gen_get_memslot_offset_amd64_systemv(self, slot, arg_count, reg_count, nested=False):
    # only support 4 memory slots for now
    if(slot >= 4):
        logging.error("Symcryptasm currently only support 4 memory slots! (requested slot%d)" % slot)
        logging.error("symcryptasm currently only support 4 memory slots! (requested slot%d)" % slot)
        exit(1)
    # For leaf functions, use the top of the redzone below the stack pointer
    offset = -8 * (slot+1)
@ -354,58 +400,230 @@ def gen_get_memslot_offset_amd64_systemv_nested(self, slot, arg_count, reg_count
    return gen_get_memslot_offset_amd64_systemv(self, slot, arg_count, reg_count, nested=True)

CALLING_CONVENTION_AMD64_SYSTEMV = CallingConvention(
    "amd64_systemv", "amd64", MAPPING_AMD64_SYSTEMV, 6, 9,
    "amd64_systemv", "amd64", MAPPING_AMD64_SYSTEMV, 6, 6, 9,
    gen_prologue_amd64_systemv, gen_epilogue_amd64_systemv, gen_get_memslot_offset_amd64_systemv)
CALLING_CONVENTION_AMD64_SYSTEMV_MUL = CallingConvention(
    "amd64_systemv", "amd64", get_mul_mapping_from_normal_mapping(MAPPING_AMD64_SYSTEMV, 6), 6, 8,
    "amd64_systemv", "amd64", get_mul_mapping_from_normal_mapping(MAPPING_AMD64_SYSTEMV, 6), 6, 6, 8,
    gen_prologue_amd64_systemv_mul, gen_epilogue_amd64_systemv, gen_get_memslot_offset_amd64_systemv)
CALLING_CONVENTION_AMD64_SYSTEMV_NESTED = CallingConvention(
    "amd64_systemv", "amd64", MAPPING_AMD64_SYSTEMV, 6, 9,
    "amd64_systemv", "amd64", MAPPING_AMD64_SYSTEMV, 6, 6, 9,
    gen_prologue_amd64_systemv_nested, gen_epilogue_amd64_systemv_nested, gen_get_memslot_offset_amd64_systemv_nested)


def gen_function_start_defines(mapping, arg_count, reg_count):
# ARM64 calling conventions
MAPPING_ARM64_AAPCS64 = {
    0: ARM64_R0,  # Argument 1 / Result register / volatile
    1: ARM64_R1,  # Argument 2 / volatile
    2: ARM64_R2,  # Argument 3 / volatile
    3: ARM64_R3,  # Argument 4 / volatile
    4: ARM64_R4,  # Argument 5 / volatile
    5: ARM64_R5,  # Argument 6 / volatile
    6: ARM64_R6,  # Argument 7 / volatile
    7: ARM64_R7,  # Argument 8 / volatile
    8: ARM64_R8,  # Indirect result location / volatile
    9: ARM64_R9,  # volatile
    10:ARM64_R10, # volatile
    11:ARM64_R11, # volatile
    12:ARM64_R12, # volatile
    13:ARM64_R13, # volatile
    14:ARM64_R14, # volatile
    15:ARM64_R15, # volatile
    # R16 and R17 are intra-procedure-call temporary registers which may be used by the linker
    # We cannot use these registers for local scratch if we call out to arbitrary procedures, but
    # currently we only have leaf functions in Arm64 symcryptasm.
    16:ARM64_R16, # IP0 / volatile
    17:ARM64_R17, # IP1 / volatile
    # R18 is a platform register which has a special meaning in kernel mode - we do not use it
    18:ARM64_R19, # non-volatile
    19:ARM64_R20, # non-volatile
    20:ARM64_R21, # non-volatile
    21:ARM64_R22, # non-volatile
    22:ARM64_R23, # non-volatile
    # We could map more registers (R24-R28) but we can only support 23 registers for ARM64EC, and we
    # don't use this many registers in any symcryptasm yet
}

MAPPING_ARM64_ARM64ECMSFT = {
    0: ARM64_R0,  # Argument 1 / Result register / volatile
    1: ARM64_R1,  # Argument 2 / volatile
    2: ARM64_R2,  # Argument 3 / volatile
    3: ARM64_R3,  # Argument 4 / volatile
    4: ARM64_R4,  # Argument 5 / volatile
    5: ARM64_R5,  # Argument 6 / volatile
    6: ARM64_R6,  # Argument 7 / volatile
    7: ARM64_R7,  # Argument 8 / volatile
    8: ARM64_R8,  # Indirect result location / volatile
    9: ARM64_R9,  # volatile
    10:ARM64_R10, # volatile
    11:ARM64_R11, # volatile
    12:ARM64_R12, # volatile
    # R13 and R14 are reserved in ARM64EC
    13:ARM64_R15, # volatile
    14:ARM64_R16, # volatile
    15:ARM64_R17, # volatile
    16:ARM64_R19, # non-volatile
    17:ARM64_R20, # non-volatile
    18:ARM64_R21, # non-volatile
    19:ARM64_R22, # non-volatile
    # R23 and R24 are reserved in ARM64EC
    20:ARM64_R25, # non-volatile
    21:ARM64_R26, # non-volatile
    22:ARM64_R27, # non-volatile
    # R28 is reserved in ARM64EC
}

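A small self-contained check of the property described in the comments above, namely that the arm64ec mapping avoids the registers ARM64EC reserves; the register name lists are transcribed from the two mappings, and the snippet is illustrative rather than part of the script.

# Illustrative sanity check: the ARM64EC mapping above must avoid reserved registers.
RESERVED_BY_ARM64EC = {"x13", "x14", "x23", "x24", "x28"}
PLATFORM_REGISTER   = {"x18"}   # not used by either mapping

arm64ec_mapped = {"x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10",
                  "x11", "x12", "x15", "x16", "x17", "x19", "x20", "x21", "x22",
                  "x25", "x26", "x27"}

assert len(arm64ec_mapped) == 23                        # 23 addressable symcrypt registers
assert RESERVED_BY_ARM64EC.isdisjoint(arm64ec_mapped)   # none of the ARM64EC-reserved ones
assert PLATFORM_REGISTER.isdisjoint(arm64ec_mapped)     # platform register left alone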
def gen_prologue_aapcs64(self, arg_count, reg_count):
    prologue = ""

    if reg_count > self.volatile_registers:
        logging.error("symcryptasm currently does not support spilling registers in leaf functions in aapcs64")
        exit(1)

    return prologue

def gen_epilogue_aapcs64(self, arg_count, reg_count):
    epilogue = ""

    if reg_count > self.volatile_registers:
        logging.error("symcryptasm currently does not support spilling registers in leaf functions in aapcs64")
        exit(1)

    epilogue += " ret\n"

    return epilogue

def gen_prologue_arm64ec(self, arg_count, reg_count):
    prologue = ""

    if reg_count > self.volatile_registers:
        # Calculate required stack space
        # If we allocate stack space we must spill fp and lr, so we always spill at least 2 registers
        registers_to_spill = 2 + reg_count - self.volatile_registers
        # Stack pointer must remain 16B aligned, so round up to the nearest multiple of 16B
        required_stack_space = 16 * ((registers_to_spill + 1) // 2)
        prologue += " PROLOG_SAVE_REG_PAIR fp, lr, #-%d! // allocate %d bytes of stack; store FP/LR\n" % (required_stack_space, required_stack_space)

        stack_offset = 16
        for i in range(self.volatile_registers, reg_count-1, 2):
            prologue += " PROLOG_SAVE_REG_PAIR X_%d, X_%d, #%d\n" % (i, i+1, stack_offset)
            stack_offset += 16
        if registers_to_spill % 2 == 1:
            prologue += " PROLOG_SAVE_REG X_%d, #%d\n" % (reg_count-1, stack_offset)

    return prologue

def gen_epilogue_arm64ec(self, arg_count, reg_count):
    epilogue = ""

    if reg_count > self.volatile_registers:
        # Calculate required stack space
        # If we allocate stack space we must spill fp and lr, so we always spill at least 2 registers
        registers_to_spill = 2 + reg_count - self.volatile_registers
        # Stack pointer must remain 16B aligned, so round up to the nearest multiple of 16B
        required_stack_space = 16 * ((registers_to_spill + 1) // 2)

        stack_offset = required_stack_space-16
        if registers_to_spill % 2 == 1:
            epilogue += " EPILOG_RESTORE_REG X_%d, #%d\n" % (reg_count-1, stack_offset)
            stack_offset -= 16
        for i in reversed(range(self.volatile_registers, reg_count-1, 2)):
            epilogue += " EPILOG_RESTORE_REG_PAIR X_%d, X_%d, #%d\n" % (i, i+1, stack_offset)
            stack_offset -= 16
        epilogue += " EPILOG_RESTORE_REG_PAIR fp, lr, #%d! // deallocate %d bytes of stack; restore FP/LR\n" % (required_stack_space, required_stack_space)
        epilogue += " EPILOG_RETURN\n"
    else:
        epilogue += " ret\n"

    return epilogue
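To illustrate the stack accounting in gen_prologue_arm64ec and gen_epilogue_arm64ec above, here is the same arithmetic with illustrative numbers (reg_count is arbitrary; 16 is the arm64ec volatile register count used below):

# Illustrative: stack space computed exactly as in gen_prologue_arm64ec above.
volatile_registers = 16          # arm64ec calling convention (see constant below)
reg_count          = 19          # hypothetical function needing 3 non-volatile registers

registers_to_spill   = 2 + reg_count - volatile_registers    # fp, lr + 3 others = 5
required_stack_space = 16 * ((registers_to_spill + 1) // 2)  # rounded up to 48 bytes

print(registers_to_spill, required_stack_space)   # 5 48
# The prologue stores fp/lr at [sp, #-48]!, the pair X_16/X_17 at offset 16,
# and X_18 on its own at offset 32; the epilogue restores them in reverse order.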
def gen_get_memslot_offset_arm64(self, slot, arg_count, reg_count, nested=False):
    logging.error("symcryptasm currently does not support memory slots for arm64!")
    exit(1)

CALLING_CONVENTION_ARM64_AAPCS64 = CallingConvention(
    "arm64_aapcs64", "arm64", MAPPING_ARM64_AAPCS64, 8, 8, 18,
    gen_prologue_aapcs64, gen_epilogue_aapcs64, gen_get_memslot_offset_arm64)

CALLING_CONVENTION_ARM64EC_MSFT = CallingConvention(
    "arm64ec_msft", "arm64", MAPPING_ARM64_ARM64ECMSFT, 8, 8, 16,
    gen_prologue_arm64ec, gen_epilogue_arm64ec, gen_get_memslot_offset_arm64)

def gen_function_defines(architecture, mapping, arg_count, reg_count, start=True):
    defines = ""
    if architecture == "amd64":
        prefix64 = "Q"
        prefix32 = "D"
        prefix16 = "W"
        prefix8  = "B"
    elif architecture == "arm64":
        prefix64 = "X_"
        prefix32 = "W_"
    else:
        logging.error("Unhandled architecture (%s) in gen_function_defines" % architecture)
        exit(1)

    for (index, reg) in mapping.items():
        if (index != 'H') and (index >= max(arg_count+1, reg_count)):
            continue
        defines += "#define Q%s %s\n" % (index, reg.name64)
        defines += "#define D%s %s\n" % (index, reg.name32)
        defines += "#define W%s %s\n" % (index, reg.name16)
        defines += "#define B%s %s\n" % (index, reg.name8)
        if start:
            if (reg.name64 is not None):
                defines += "#define %s%s %s\n" % (prefix64, index, reg.name64)
            if (reg.name32 is not None):
                defines += "#define %s%s %s\n" % (prefix32, index, reg.name32)
            if (reg.name16 is not None):
                defines += "#define %s%s %s\n" % (prefix16, index, reg.name16)
            if (reg.name8 is not None):
                defines += "#define %s%s %s\n" % (prefix8, index, reg.name8)
        else:
            if (reg.name64 is not None):
                defines += "#undef %s%s\n" % (prefix64, index)
            if (reg.name32 is not None):
                defines += "#undef %s%s\n" % (prefix32, index)
            if (reg.name16 is not None):
                defines += "#undef %s%s\n" % (prefix16, index)
            if (reg.name8 is not None):
                defines += "#undef %s%s\n" % (prefix8, index)
    return defines

def gen_function_end_defines(mapping, arg_count, reg_count):
    undefs = ""
    for (index, _) in mapping.items():
        if (index != 'H') and (index >= max(arg_count+1, reg_count)):
            continue
        undefs += "#undef Q%s\n" % (index)
        undefs += "#undef D%s\n" % (index)
        undefs += "#undef W%s\n" % (index)
        undefs += "#undef B%s\n" % (index)
    return undefs
def gen_function_start_defines(architecture, mapping, arg_count, reg_count):
    return gen_function_defines(architecture, mapping, arg_count, reg_count, start=True)

MASM_FRAMELESS_FUNCTION_ENTRY = "LEAF_ENTRY %s, _TEXT\n"
MASM_FRAMELESS_FUNCTION_END   = "LEAF_END %s, _TEXT\n"
MASM_FRAME_FUNCTION_ENTRY     = "NESTED_ENTRY %s, _TEXT\n"
MASM_FRAME_FUNCTION_END       = "NESTED_END %s, _TEXT\n"
def gen_function_end_defines(architecture, mapping, arg_count, reg_count):
    return gen_function_defines(architecture, mapping, arg_count, reg_count, start=False)

MASM_FRAMELESS_FUNCTION_ENTRY = "LEAF_ENTRY %s"
MASM_FRAMELESS_FUNCTION_END   = "LEAF_END %s"
MASM_FRAME_FUNCTION_ENTRY     = "NESTED_ENTRY %s"
MASM_FRAME_FUNCTION_END       = "NESTED_END %s"

# MASM function macros take the text area as an argument
MASM_FUNCTION_TEMPLATE = "%s, _TEXT\n"
# ARMASM64 function macros must be correctly indented
ARMASM64_FUNCTION_TEMPLATE = " %s\n"

GAS_FUNCTION_ENTRY = "%s: .global %s\n"
GAS_FUNCTION_END = ""
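A quick illustration of how the entry templates above compose, using a hypothetical function name; the string values are copied from the definitions above.

# Illustrative: composing the function entry templates above (hypothetical name).
MASM_FRAMELESS_FUNCTION_ENTRY = "LEAF_ENTRY %s"
MASM_FUNCTION_TEMPLATE        = "%s, _TEXT\n"
ARMASM64_FUNCTION_TEMPLATE    = " %s\n"

entry = MASM_FRAMELESS_FUNCTION_ENTRY % "SymCryptExampleFunc"
print(MASM_FUNCTION_TEMPLATE % entry)       # -> "LEAF_ENTRY SymCryptExampleFunc, _TEXT\n"
print(ARMASM64_FUNCTION_TEMPLATE % entry)   # -> " LEAF_ENTRY SymCryptExampleFunc\n" (indented for armasm64)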
def generate_prologue(assembler, calling_convention, function_name, arg_count, reg_count, nested):
    function_entry = None
    if assembler == "masm":
        # need to identify and mark up frame functions in masm
    if assembler in ["masm", "armasm64"]:
        # need to identify and mark up frame functions in masm and armasm64
        if nested or (reg_count > calling_convention.volatile_registers):
            function_entry = MASM_FRAME_FUNCTION_ENTRY % (function_name)
        else:
            function_entry = MASM_FRAMELESS_FUNCTION_ENTRY % (function_name)

        if assembler == "masm":
            function_entry = MASM_FUNCTION_TEMPLATE % function_entry
        elif assembler == "armasm64":
            function_entry = ARMASM64_FUNCTION_TEMPLATE % function_entry
    elif assembler == "gas":
        function_entry = GAS_FUNCTION_ENTRY % (function_name, function_name)
    else:
        logging.error("Unhandled assembler (%s) in generate_prologue" % assembler)
        exit(1)

    prologue = gen_function_start_defines(calling_convention.mapping, arg_count, reg_count)
    prologue = gen_function_start_defines(calling_convention.architecture, calling_convention.mapping, arg_count, reg_count)
    prologue += "%s" % (function_entry)
    prologue += calling_convention.gen_prologue_fn(arg_count, reg_count)
@ -413,31 +631,41 @@ def generate_prologue(assembler, calling_convention, function_name, arg_count, r

def generate_epilogue(assembler, calling_convention, function_name, arg_count, reg_count, nested):
    function_end = None
    if assembler == "masm":
    if assembler in ["masm", "armasm64"]:
        # need to identify and mark up frame functions in masm
        if nested or (reg_count > calling_convention.volatile_registers):
            function_end = MASM_FRAME_FUNCTION_END % (function_name)
        else:
            function_end = MASM_FRAMELESS_FUNCTION_END % (function_name)

        if assembler == "masm":
            function_end = MASM_FUNCTION_TEMPLATE % function_end
        elif assembler == "armasm64":
            function_end = ARMASM64_FUNCTION_TEMPLATE % function_end
    elif assembler == "gas":
        function_end = GAS_FUNCTION_END
    else:
        logging.error("Unhandled assembler (%s) in generate_epilogue" % assembler)
        exit(1)

    epilogue = calling_convention.gen_epilogue_fn(arg_count, reg_count)
    epilogue += "%s" % (function_end)
    epilogue += gen_function_end_defines(calling_convention.mapping, arg_count, reg_count)
    epilogue += gen_function_end_defines(calling_convention.architecture, calling_convention.mapping, arg_count, reg_count)

    return epilogue

MASM_MACRO_START    = "%s MACRO %s\n"
MASM_MACRO_END      = "ENDM\n"
ARMASM64_MACRO_START= " MACRO\n %s %s"
ARMASM64_MACRO_END  = " MEND\n"
GAS_MACRO_START     = ".macro %s %s\n"
GAS_MACRO_END       = ".endm\n"
MASM_ALTERNATE_ENTRY= "ALTERNATE_ENTRY %s\n"
GAS_ALTERNATE_ENTRY = "%s: .global %s\n"


FUNCTION_START_PATTERN  = re.compile("\s*(NESTED_)?(MUL_)?FUNCTION_START\s*\(\s*([a-zA-Z0-9]+)\s*,\s*([0-9]+)\s*,\s*([0-9]+)\s*\)")
FUNCTION_END_PATTERN    = re.compile("\s*(NESTED_)?(MUL_)?FUNCTION_END\s*\(\s*([a-zA-Z0-9]+)\s*\)")
FUNCTION_START_PATTERN  = re.compile("\s*(NESTED_)?(MUL_)?FUNCTION_START\s*\(\s*([a-zA-Z0-9_\(\)]+)\s*,\s*([0-9]+)\s*,\s*([0-9]+)\s*\)")
FUNCTION_END_PATTERN    = re.compile("\s*(NESTED_)?(MUL_)?FUNCTION_END\s*\(\s*([a-zA-Z0-9_\(\)]+)\s*\)")
GET_MEMSLOT_PATTERN     = re.compile("GET_MEMSLOT_OFFSET\s*\(\s*slot([0-9]+)\s*\)")
ALTERNATE_ENTRY_PATTERN = re.compile("\s*ALTERNATE_ENTRY\s*\(\s*([a-zA-Z0-9]+)\s*\)")
MACRO_START_PATTERN     = re.compile("\s*MACRO_START\s*\(\s*([A-Z_0-9]+)\s*,([^\)]+)\)")
@ -499,29 +727,41 @@ class ProcessingStateMachine:
        self.arg_count = int(match.groups()[-2])
        self.reg_count = int(match.groups()[-1])

        if self.is_nested_function and self.nested_calling_convention is None:
            logging.error(
                "symcryptasm nested functions are not currently supported with assembler (%s) and architecture (%s)!\n\t"
                "%s (line %d)"
                % (self.assembler, self.normal_calling_convention.architecture, line, line_num))
            exit(1)
        if self.is_mul_function and self.mul_calling_convention is None:
            logging.error(
                "symcryptasm mul functions are not supported with assembler (%s) and architecture (%s)!\n\t"
                "%s (line %d)"
                % (self.assembler, self.normal_calling_convention.architecture, line, line_num))
            exit(1)
        if self.is_nested_function and self.is_mul_function:
            logging.error(
                "Too many prefixes for symcryptasm function - currently only 1 of prefix, MUL_ or NESTED_, is supported!\n\t"
                "%s (line %d)"
                % (line, line_num))
            exit(1)
        if self.arg_count > MAX_FUNCTION_ARGUMENT_COUNT:
        if self.arg_count > self.normal_calling_convention.max_arguments:
            logging.error(
                "Too many (%d) arguments for symcryptasm function - currently only %d arguments are supported!\n\t"
                "Too many (%d) arguments for symcryptasm function - only %d arguments are supported by calling convention (%s)\n\t"
                "%s (line %d)"
                % (self.arg_count, MAX_FUNCTION_ARGUMENT_COUNT, match.group(0), line_num))
                % (self.arg_count, self.normal_calling_convention.max_arguments, self.normal_calling_convention.name, match.group(0), line_num))
            exit(1)
        if self.reg_count > MAX_FUNCTION_REGISTER_COUNT:
        if self.reg_count > len(self.normal_calling_convention.mapping):
            logging.error(
                "Too many (%d) registers required for symcryptasm function - only %d registers are supported!\n\t"
                "Too many (%d) registers required for symcryptasm function - only %d registers are mapped by calling convention (%s)\n\t"
                "%s (line %d)"
                % (self.reg_count, MAX_FUNCTION_REGISTER_COUNT, match.group(0), line_num))
                % (self.reg_count, len(self.normal_calling_convention.mapping), self.normal_calling_convention.name, match.group(0), line_num))
            exit(1)
        if self.is_mul_function and self.reg_count > MAX_FUNCTION_REGISTER_COUNT-1:
        if self.is_mul_function and self.reg_count > len(self.mul_calling_convention.mapping)-1:
            logging.error(
                "Too many (%d) registers required for symcryptasm mul function - only %d registers are supported!\n\t"
                "Too many (%d) registers required for symcryptasm mul function - only %d registers are mapped by calling convention (%s)\n\t"
                "%s (line %d)"
                % (self.reg_count, MAX_FUNCTION_REGISTER_COUNT-1, match.group(0), line_num))
                % (self.reg_count, len(self.mul_calling_convention.mapping)-1, self.mul_calling_convention.name, match.group(0), line_num))
            exit(1)

        logging.info("%d: function start %s, %d, %d" % (line_num, self.function_name, self.arg_count, self.reg_count))
@ -546,10 +786,18 @@ class ProcessingStateMachine:
            return MASM_MACRO_START % (self.macro_name, match.group(2))
        elif self.assembler == "gas":
            return GAS_MACRO_START % (self.macro_name, match.group(2))
        elif self.assembler == "armasm64":
            # In armasm64 we need to escape all macro arguments with $
            prefixed_args = ", $".join(self.macro_args)
            if prefixed_args:
                prefixed_args = "$" + prefixed_args
            return ARMASM64_MACRO_START % (self.macro_name, prefixed_args)
        else:
            logging.error("Unhandled assembler (%s) in process_start_macro" % assembler)
            exit(1)

    def process_function_line(self, line, line_num):
        # Currently in a function

        match = ALTERNATE_ENTRY_PATTERN.match(line)
        if (match):
            if self.assembler == "masm":
@ -562,12 +810,12 @@ class ProcessingStateMachine:
            # Check the end function has same prefix as previous start function
            if (self.is_nested_function ^ (match.group(1) == "NESTED_")) or \
                (self.is_mul_function ^ (match.group(2) == "MUL_")):
                logging.error("Function start and end do not have same MUL_ or NESTED_ prefix!\n\tStart: %s (line %d)\n\tEnd: %s (line %d)"
                logging.error("Function start and end do not have same MUL_ or NESTED_ prefix!\n\tStart: %s (line %d)\n\tEnd: %s (line %d)" \
                    % (self.function_start_match.group(0), self.function_start_line, match.group(0), line_num))
                exit(1)
            # Check the end function pattern has the same label as the previous start function pattern
            if self.function_name != match.groups()[-1]:
                logging.error("Function start label does not match Function end label!\n\tStart: %s (line %d)\n\tEnd: %s (line %d)"
                logging.error("Function start label does not match Function end label!\n\tStart: %s (line %d)\n\tEnd: %s (line %d)" \
                    % (self.function_name, self.function_start_line, match.groups()[-1], line_num))
                exit(1)
@ -613,8 +861,18 @@ class ProcessingStateMachine:
                return MASM_MACRO_END
            elif self.assembler == "gas":
                return GAS_MACRO_END
            elif self.assembler == "armasm64":
                return ARMASM64_MACRO_END
            else:
                logging.error("Unhandled assembler (%s) in process_macro_line" % self.assembler)
                exit(1)

        if self.assembler == "gas":

        if self.assembler == "armasm64":
            # In armasm64 macros we need to escape all of the macro arguments with a $ in the macro body
            for arg in self.macro_args:
                line = re.sub(arg, "$%s" % arg, line)
        elif self.assembler == "gas":
            # In GAS macros we need to escape all of the macro arguments with a backslash in the macro body
            for arg in self.macro_args:
                line = re.sub(arg, r"\\%s" % arg, line)
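The escaping performed above can be seen in isolation in the following sketch; the macro argument names and the body line are made up for illustration.

# Illustrative: armasm64 vs GAS escaping of macro arguments in a macro body line.
import re

macro_args = ["src", "dst"]
line = "    mov dst, src\n"

armasm64_line = line
for arg in macro_args:
    armasm64_line = re.sub(arg, "$%s" % arg, armasm64_line)
print(armasm64_line)   # "    mov $dst, $src\n"

gas_line = line
for arg in macro_args:
    gas_line = re.sub(arg, r"\\%s" % arg, gas_line)
print(gas_line)        # "    mov \dst, \src\n"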
@ -622,18 +880,40 @@ class ProcessingStateMachine:
        # Not modifying the line any further
        return line

def process_file(target, infilename, outfilename):
    assembler = None
    if target == "masm":
        assembler = "masm"
        normal_calling_convention = CALLING_CONVENTION_AMD64_MSFT
        mul_calling_convention = CALLING_CONVENTION_AMD64_MSFT_MUL
        nested_calling_convention = CALLING_CONVENTION_AMD64_MSFT_NESTED
    elif target == "gas":
        assembler = "gas"
        normal_calling_convention = CALLING_CONVENTION_AMD64_SYSTEMV
        mul_calling_convention = CALLING_CONVENTION_AMD64_SYSTEMV_MUL
        nested_calling_convention = CALLING_CONVENTION_AMD64_SYSTEMV_NESTED
def process_file(assembler, architecture, calling_convention, infilename, outfilename):
    normal_calling_convention = None

    if assembler == "masm":
        if architecture == "amd64" and calling_convention == "msft":
            normal_calling_convention = CALLING_CONVENTION_AMD64_MSFT
            mul_calling_convention = CALLING_CONVENTION_AMD64_MSFT_MUL
            nested_calling_convention = CALLING_CONVENTION_AMD64_MSFT_NESTED
    elif assembler == "gas":
        if architecture == "amd64" and calling_convention == "systemv":
            normal_calling_convention = CALLING_CONVENTION_AMD64_SYSTEMV
            mul_calling_convention = CALLING_CONVENTION_AMD64_SYSTEMV_MUL
            nested_calling_convention = CALLING_CONVENTION_AMD64_SYSTEMV_NESTED
        elif architecture == "arm64" and calling_convention == "aapcs64":
            normal_calling_convention = CALLING_CONVENTION_ARM64_AAPCS64
            mul_calling_convention = None
            nested_calling_convention = None
    elif assembler == "armasm64":
        if architecture == "arm64" and calling_convention == "aapcs64":
            normal_calling_convention = CALLING_CONVENTION_ARM64_AAPCS64
            mul_calling_convention = None
            nested_calling_convention = None
        elif architecture == "arm64" and calling_convention == "arm64ec":
            normal_calling_convention = CALLING_CONVENTION_ARM64EC_MSFT
            mul_calling_convention = None
            nested_calling_convention = None
    else:
        logging.error("Unhandled assembler (%s) in process_file" % assembler)
        exit(1)

    if normal_calling_convention is None:
        logging.error("Unhandled combination (%s + %s + %s) in process_file"
            % (assembler, architecture, calling_convention))
        exit(1)

    # iterate through file line by line in one pass
    file_processing_state = ProcessingStateMachine(
@ -649,9 +929,11 @@ if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Preprocess symcryptasm into files that will be further processed with C preprocessor to generate MASM or GAS")
    parser.add_argument('target', type=str, help='Target that we want to preprocess for')
    parser.add_argument('assembler', type=str, help='Assembler that we want to preprocess for', choices=['masm', 'gas', 'armasm64'])
    parser.add_argument('architecture', type=str, help='Architecture that we want to preprocess for', choices=['amd64', 'arm64'])
    parser.add_argument('calling_convention', type=str, help='Calling convention that we want to preprocess for', choices=['msft', 'systemv', 'aapcs64', 'arm64ec'])
    parser.add_argument('inputfile', type=str, help='Path to input file')
    parser.add_argument('outputfile', type=str, help='Path to output file')

    args = parser.parse_args()
    process_file(args.target, args.inputfile, args.outputfile)
    process_file(args.assembler, args.architecture, args.calling_convention, args.inputfile, args.outputfile)
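For reference, the new five-argument interface can also be driven directly from Python rather than via argparse; the import path and file names below are assumptions for illustration.

# Illustrative: equivalent of
#   python3 symcryptasm_processor.py gas arm64 aapcs64 fdef_asm.symcryptasm fdef_asm.gen.asm
# Assumes this script is importable as symcryptasm_processor; file names are hypothetical.
from symcryptasm_processor import process_file

process_file("gas", "arm64", "aapcs64", "fdef_asm.symcryptasm", "fdef_asm.gen.asm")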