Mirror of https://github.com/microsoft/SymCrypt.git
Merged PR 11150425: Arm64 server perf work
## Description:
+ Improve `SymCryptFdefMontgomeryReduceAsm`
  + Reduce instruction count in the inner loop - remove superfluous `adc` with zero
  + Special case first iteration of the reduction loop to further reduce instruction count and multiplication uops
  + For ease of phrasing, used non-volatile registers in aapcs64 assembly for the first time, and had to slightly extend the SymCryptAsm processor script for this.
+ Improve `SymCryptFdefRawSquareAsm` by tweaking to reduce undue dependencies.
+ More room for improvements in a follow-on PR, but checking in what we have to get improvements before the GE deadline.

## Admin Checklist:
- [X] You have updated documentation in symcrypt.h to reflect any changes in behavior
- [X] You have updated CHANGELOG.md to reflect any changes in behavior
- [X] You have updated symcryptunittest to exercise any new functionality
- [X] If you have introduced any symbols in symcrypt.h you have updated production and test dynamic export symbols (exports.ver / exports.def / symcrypt.src) and tested the updated dynamic modules with symcryptunittest
- [X] If you have introduced functionality that varies based on CPU features, you have manually tested with and without relevant features
- [X] If you have made significant changes to a particular algorithm, you have checked that performance numbers reported by symcryptunittest are in line with expectations
- [X] If you have added new algorithms/modes, you have updated the status indicator text for the associated modules if necessary
This commit is contained in:
Parent: 5e521761ef
Commit: 982858166c
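
For context on the main change: `SymCryptFdefMontgomeryReduceAsm` performs word-by-word Montgomery reduction (REDC), retiring one 64-bit word of `pSrc` per outer-loop pass by adding a multiple of the modulus chosen so that the retired word becomes zero. The sketch below is a minimal Python model of that algorithm, written for illustration only; the function name, the little-endian word-list representation, and the self-test are assumptions of this example, not SymCrypt code.

```python
# Illustrative model of word-wise Montgomery reduction (REDC); not SymCrypt code.
import random

M64 = (1 << 64) - 1

def montgomery_reduce(pSrc, pMod, inv64):
    """pSrc: 2*nWords little-endian 64-bit words, pMod: nWords words (odd modulus),
    inv64 = -pMod^-1 mod 2^64 (the stored Inv64). Returns pSrc * 2^(-64*nWords) mod pMod."""
    nWords = len(pMod)
    src = list(pSrc)
    hc = 0                                   # high carry, the 'hc' variable in the assembly
    for i in range(nWords):
        m = (src[i] * inv64) & M64           # multiplier that zeroes src[i]
        c = 0                                # running carry, the 'c' variable
        for j in range(nWords):
            t = src[i + j] + m * pMod[j] + c
            src[i + j] = t & M64             # src[i] itself becomes 0 here
            c = t >> 64
        t = src[i + nWords] + c + hc         # fold c and hc into the next word
        src[i + nWords] = t & M64
        hc = t >> 64
    value = sum(w << (64 * k) for k, w in enumerate(src[nWords:])) + (hc << (64 * nWords))
    modulus = sum(w << (64 * k) for k, w in enumerate(pMod))
    return value - modulus if value >= modulus else value   # the final subtract / masked copy

# Self-check against direct modular arithmetic (4 words = one 256-bit digit).
nWords = 4
R = 1 << (64 * nWords)
N = random.getrandbits(64 * nWords) | 1 | (1 << (64 * nWords - 1))   # random odd modulus
inv64 = (-pow(N, -1, 1 << 64)) % (1 << 64)
T = random.randrange(N) * random.randrange(N)
src = [(T >> (64 * k)) & M64 for k in range(2 * nWords)]
mod = [(N >> (64 * k)) & M64 for k in range(nWords)]
assert montgomery_reduce(src, mod, inv64) == (T * pow(R, -1, N)) % N
```
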

@@ -3,7 +3,10 @@
 New changes will be listed here as they are developed. The version number is determined
 prior to the creation of a new release, based on the changes contained in that release.
 
+# Version 103.4.3
+
 - Added preliminary support for macOS (static libraries and unit tests only, no ASM optimizations)
+- Performance improvements for RSA for modern Arm64 microarchitecture
 
 # Version 103.4.2
 
@@ -331,7 +331,7 @@ FUNCTION_START(ARM64EC_NAME_MANGLE(SymCryptFdef369MontgomeryReduceAsm), 3, 18)
     mov X_16, X_0 // Store the pMod pointer
     mov X_17, X_1 // Store the pSrc pointer
 
-    and X_7, X_7, xzr // Set hc to 0
+    mov X_7, xzr // Set hc to 0
 
     //
     // Main loop
@@ -340,7 +340,7 @@ LABEL(SymCryptFdef369MontgomeryReduceAsmOuter)
     ldr X_8, [X_1, #24] // Load 1 word from pSrc
     mul X_6, X_8, X_5 // <63:0> bits of pSrc[i]*Inv64 = m
 
-    and X_12, X_12, xzr // Set c to 0
+    mov X_12, xzr // Set c to 0
 
 LABEL(SymCryptFdef369MontgomeryReduceAsmInner)
     ldp X_10, X_11, [X_0, #24]! // pMod[j]
@@ -308,13 +308,14 @@ MACRO_START(SQR_DOUBLEADD_64, index, src_reg, dst_reg, mul_word, src_carry, dst_
     ldr scratch2, [dst_reg, #8*index] // pDst[2*(i+j)]
 
     mul scratch1, mul_word, scratch0 // Bits <63:0> of pSrc[i]*pSrc[i+j]
-    adds scratch1, scratch1, src_carry // Adding the previous word
     umulh dst_carry, mul_word, scratch0 // Bits <127:64> of pSrc[i]*pSrc[i+j]
-    adc dst_carry, dst_carry, xzr // Add the intermediate carry and don't update the flags
 
     adds scratch1, scratch1, scratch2 // Add the word from the destination
     adc dst_carry, dst_carry, xzr // Add the intermediate carry and don't update the flags
 
+    adds scratch1, scratch1, src_carry // Adding the previous word
+    adc dst_carry, dst_carry, xzr // Add the intermediate carry and don't update the flags
+
     str scratch1, [dst_reg, #8*index] // Store to destination
 
 MACRO_END()
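
At the value level, `SQR_DOUBLEADD_64` folds one 64x64-to-128 multiply into a destination word together with an incoming carry. The reordering above only changes when the incoming carry is added, not what is computed, because the whole sum still fits in 128 bits. A hedged Python model of the arithmetic (the function name and tuple return are illustrative, not part of SymCrypt):

```python
M64 = (1 << 64) - 1

def sqr_doubleadd_64(src_word, dst_word, mul_word, src_carry):
    """Value-level model of SQR_DOUBLEADD_64.

    Computes mul_word*src_word + dst_word + src_carry; this fits in 128 bits since
    (2^64-1)^2 + 2*(2^64-1) = 2^128 - 1, so one 64-bit carry word is enough.
    Returns (word stored back to pDst, carry passed to the next invocation).
    """
    t = mul_word * src_word + dst_word + src_carry
    return t & M64, t >> 64
```

Deferring the `src_carry` addition to the end lets the multiply and destination additions start without waiting for the previous invocation's carry; the unrolled Word0-Word3 loop further below alternates the carry between X_12 and X_11 for the same reason, which is why the outer loop now zeroes both.
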
@@ -384,9 +385,9 @@ FUNCTION_START(ARM64EC_NAME_MANGLE(SymCryptFdefRawSquareAsm), 3, 17)
     //
     // First iteration of main loop (no adding of previous values from pDst)
     //
-    ands X_12, X_12, xzr // Clearing the carry flag and setting X_12 = 0
+    mov X_12, xzr // set X_12 = 0
     ldr X_6, [X_0] // load the first word from pSrc1
-    str X_12, [X_4] // store 0 for the first word
+    str xzr, [X_4] // store 0 for the first word
 
     b SymCryptFdefRawSquareAsmInnerLoopInit_Word1
 
@@ -423,7 +424,8 @@ LABEL(SymCryptFdefRawSquareAsmOuterLoop)
     mov X_2, X_0 // set pSrc
     mov X_4, X_5 // set pDst
 
-    ands X_12, X_12, xzr // Clearing the carry flag and setting X_12 = 0
+    mov X_11, xzr // set X_11 = 0
+    mov X_12, xzr // set X_12 = 0
     ldr X_6, [X_0, X_8, LSL #3] // load the next word from pSrc
 
     // Cyclic counter and jump logic
@@ -448,16 +450,16 @@ LABEL(SymCryptFdefRawSquareAsmOuterLoop)
     mov X_3, X_14 // set the new digit counter
 
 LABEL(SymCryptFdefRawSquareAsmInnerLoop_Word0)
-    SQR_DOUBLEADD_64 0, X_2, X_4, X_6, X_12, X_12, X_7, X_9, X_10
+    SQR_DOUBLEADD_64 0, X_2, X_4, X_6, X_12, X_11, X_7, X_9, X_10
 
 LABEL(SymCryptFdefRawSquareAsmInnerLoop_Word1)
-    SQR_DOUBLEADD_64 1, X_2, X_4, X_6, X_12, X_12, X_7, X_9, X_10
+    SQR_DOUBLEADD_64 1, X_2, X_4, X_6, X_11, X_12, X_7, X_9, X_10
 
 LABEL(SymCryptFdefRawSquareAsmInnerLoop_Word2)
-    SQR_DOUBLEADD_64 2, X_2, X_4, X_6, X_12, X_12, X_7, X_9, X_10
+    SQR_DOUBLEADD_64 2, X_2, X_4, X_6, X_12, X_11, X_7, X_9, X_10
 
 LABEL(SymCryptFdefRawSquareAsmInnerLoop_Word3)
-    SQR_DOUBLEADD_64 3, X_2, X_4, X_6, X_12, X_12, X_7, X_9, X_10
+    SQR_DOUBLEADD_64 3, X_2, X_4, X_6, X_11, X_12, X_7, X_9, X_10
 
     sub X_3, X_3, #1 // move one digit up
     add X_2, X_2, #32
@@ -470,8 +472,7 @@ LABEL(SymCryptFdefRawSquareAsmInnerLoop_Word3)
     sub X_1, X_1, #1 // move one word up
     cbnz X_1, SymCryptFdefRawSquareAsmOuterLoop
 
-    ands X_12, X_12, xzr // Setting X_12 = 0
-    str X_12, [X_5, #40] // Store 0 to destination for the top word
+    str xzr, [X_5, #40] // Store 0 to destination for the top word
 
     ////////////////////////////////////////////////////////////////
     // Second Pass - Shifting all results 1 bit left
@@ -1240,136 +1241,156 @@ FUNCTION_END(ARM64EC_NAME_MANGLE(SymCryptFdefModSquareMontgomeryP384Asm))
 // X_0 = pMod (moving forward one *digit* every inner loop)
 // X_1 = pSrc (moving forward one *digit* every inner loop)
 // X_2 = pDst (used only in the end for subtract / result)
-// X_3 = digit count of pSrc and pMod
+// X_3 = byte count of pSrc and pMod (sometimes byte count - 1 digit of bytes)
 // X_4 = word count of pSrc
 // X_5 = Inv64 of the modulus
 // X_6 = m = pSrc[i]*Inv64
 // X_7 = hc = high carry variable
-// X_8, X_9 = Current words loaded in pairs from pSrc
-// X_10, X_11 = Current words loaded in pairs from pMod
-// X_12, X_13 = c variable = "128-bit" register to hold the result of multiplies
-// It is flipped between [X_12:X_13] and [X_13:X_12] instead of doing c>>=64
-// X_14 = Temporary intermediate result
-// X_15 = Stored digit count of pSrc
-// X_16 = Stored pMod pointer
-// X_17 = Stored pSrc pointer (moving forward one word every outer loop)
+// X_8-X_19 = temporaries
+// X_20 = c
+// X_21 = Stored byte count of pSrc
 
-FUNCTION_START(ARM64EC_NAME_MANGLE(SymCryptFdefMontgomeryReduceAsm), 3, 18)
+FUNCTION_START(ARM64EC_NAME_MANGLE(SymCryptFdefMontgomeryReduceAsm), 3, 22)
 
-    ldr W_3, [X_0, #SymCryptModulusNdigitsOffsetArm64] // # of Digits
+    ldr W_4, [X_0, #SymCryptModulusNdigitsOffsetArm64] // # of Digits
     ldr X_5, [X_0, #SymCryptModulusInv64OffsetArm64] // Inv64 of modulus
     add X_0, X_0, #SymCryptModulusValueOffsetArm64 // pMod
 
-    lsl X_4, X_3, #2 // Multiply by 4 to get the number of words
+    lsl X_3, X_4, #5 // Multiply by 32 to get the number of bytes
+    lsl X_4, X_4, #2 // Multiply by 4 to get the number of words
 
-    sub X_0, X_0, #32 // offset pMod so we can use pre-increment form of loads
-    sub X_1, X_1, #32 // offset pSrc so we can use pre-increment form of loads
-    sub X_2, X_2, #32 // offset pDst so we can use pre-increment form of loads
+    sub X_3, X_3, #32 // offset the byte count to make it easy to restore pointers
 
-    mov X_15, X_3 // Store the digit count for later
-    mov X_16, X_0 // Store the pMod pointer
-    mov X_17, X_1 // Store the pSrc pointer
-
-    and X_7, X_7, xzr // Set hc to 0
+    mov X_7, xzr // Set hc to 0
+    mov X_21, X_3 // Store the byte count for later
 
     //
     // Main loop
     //
 LABEL(SymCryptFdefMontgomeryReduceAsmOuter)
-    ldr X_8, [X_1, #32] // Load 1 word from pSrc
-    mul X_6, X_8, X_5 // <63:0> bits of pSrc[i]*Inv64 = m
+    ldp X_16, X_17, [X_1] // pSrc[i+0], pSrc[i+1]
+    mul X_6, X_16, X_5 // <63:0> bits of pSrc[i]*Inv64 = m
 
-    and X_12, X_12, xzr // Set c to 0
+    ldp X_8, X_9, [X_0] // pMod[0], pMod[1]
+    umulh X_8, X_6, X_8 // Bits <127:64> of m*pMod[0]
+    subs xzr, X_16, #1 // Subtract 1 from pSrc[i+0] (generate a carry iff pSrc[i] is non-zero)
+
+    mul X_13, X_6, X_9 // Bits <63:0> of m*pMod[1]
+    umulh X_9, X_6, X_9 // Bits <127:64> of m*pMod[1]
+    adcs X_8, X_8, X_13 // Adding the previous word (if there was a carry from the last addition it is added)
+
+    ldp X_10, X_11, [X_0, #16] // pMod[2], pMod[3]
+    mul X_14, X_6, X_10 // Bits <63:0> of m*pMod[2]
+    umulh X_10, X_6, X_10 // Bits <127:64> of m*pMod[2]
+    adcs X_9, X_9, X_14 // Adding the previous word (if there was a carry from the last addition it is added)
+
+    ldp X_18, X_19, [X_1, #16] // pSrc[i+2], pSrc[i+3]
+    mul X_15, X_6, X_11 // Bits <63:0> of m*pMod[3]
+    umulh X_11, X_6, X_11 // Bits <127:64> of m*pMod[3]
+    adcs X_10, X_10, X_15 // Adding the previous word (if there was a carry from the last addition it is added)
+
+    adc X_20, X_11, xzr // Add the carry if any
+
+    // ( X_20, X_10, X_9, X_8, ___ ) = (( pMod[3], pMod[2], pMod[1], pMod[0] ) * m)
+    // We do not compute this --^ because (m*pMod[0] + pSrc[i]) == 0. We already accounted for the carry above.
+
+    adds X_17, X_17, X_8 // Adding pSrc[i+1]
+    adcs X_18, X_18, X_9 // Adding pSrc[i+2]
+    adcs X_19, X_19, X_10 // Adding pSrc[i+3]
+
+    stp xzr, X_17, [X_1] // pSrc[i+0], pSrc[i+1]
+    stp X_18, X_19, [X_1, #16] // pSrc[i+2], pSrc[i+3]
+
+    cbz X_3, SymCryptFdefMontgomeryReduceAsmInnerEnd
 
 LABEL(SymCryptFdefMontgomeryReduceAsmInner)
-    ldp X_10, X_11, [X_0, #32]! // pMod[j]
-    ldp X_8, X_9, [X_1, #32]! // pSrc[j]
+    // Invariant - X_20 (c) + carry flag is a value in the range [0, 2^64-1]
+    // We could always adc X_20, X_20, xzr before loop entry, but this is a wasted instruction
+    // vs. adcs with X_20 below
 
-    mul X_14, X_6, X_10 // <63:0> of pMod[j]*m
-    adds X_14, X_14, X_8 // Adding pSrc[j]
-    umulh X_13, X_6, X_10 // <127:64> of pMod[j]*m
-    adc X_13, X_13, xzr // Add the carry if any (***)
-    adds X_12, X_12, X_14 // Add the lower bits of c
-    adc X_13, X_13, xzr // Add the carry if any (***)
-    // ***: These cannot produce extra carry as the maximum is
-    // (2^64 - 1)*(2^64-1) + 2^64-1 + 2^64-1 = 2^128 - 1
-    str X_12, [X_1] // pSrc[j] = (UINT64) c
+    ldp X_8, X_9, [X_0, #32]! // pMod[j+0], pMod[j+1]
+    mul X_12, X_6, X_8 // Bits <63:0> of m*pMod[j]
+    umulh X_8, X_6, X_8 // Bits <127:64> of m*pMod[j]
+    adcs X_12, X_20, X_12 // Adding the previous word (if there was a carry from the last addition it is added)
 
-    mul X_14, X_6, X_11 // <63:0> of pMod[j]*m
-    adds X_14, X_14, X_9 // Adding pSrc[j]
-    umulh X_12, X_6, X_11 // <127:64> of pMod[j]*m
-    adc X_12, X_12, xzr // Add the carry if any (***)
-    adds X_13, X_13, X_14 // Add the lower bits of c
-    adc X_12, X_12, xzr // Add the carry if any (***)
-    str X_13, [X_1, #8] // pSrc[j] = (UINT64) c
+    mul X_13, X_6, X_9 // Bits <63:0> of m*pMod[j+1]
+    umulh X_9, X_6, X_9 // Bits <127:64> of m*pMod[j+1]
+    ldp X_16, X_17, [X_1, #32]! // pSrc[i+j+0], pSrc[i+j+1]
+    adcs X_8, X_8, X_13 // Adding the previous word (if there was a carry from the last addition it is added)
 
-    ldp X_10, X_11, [X_0, #16] // pMod[j]
-    ldp X_8, X_9, [X_1, #16] // pSrc[j]
+    ldp X_10, X_11, [X_0, #16] // pMod[j+2], pMod[j+3]
+    mul X_14, X_6, X_10 // Bits <63:0> of m*pMod[j+2]
+    umulh X_10, X_6, X_10 // Bits <127:64> of m*pMod[j+2]
+    adcs X_9, X_9, X_14 // Adding the previous word (if there was a carry from the last addition it is added)
 
-    mul X_14, X_6, X_10 // <63:0> of pMod[j]*m
-    adds X_14, X_14, X_8 // Adding pSrc[j]
-    umulh X_13, X_6, X_10 // <127:64> of pMod[j]*m
-    adc X_13, X_13, xzr // Add the carry if any (***)
-    adds X_12, X_12, X_14 // Add the lower bits of c
-    adc X_13, X_13, xzr // Add the carry if any (***)
-    str X_12, [X_1, #16] // pSrc[j] = (UINT64) c
+    ldp X_18, X_19, [X_1, #16] // pSrc[i+j+2], pSrc[i+j+3]
+    mul X_15, X_6, X_11 // Bits <63:0> of m*pMod[j+3]
+    umulh X_11, X_6, X_11 // Bits <127:64> of m*pMod[j+3]
+    adcs X_10, X_10, X_15 // Adding the previous word (if there was a carry from the last addition it is added)
 
-    mul X_14, X_6, X_11 // <63:0> of pMod[j]*m
-    adds X_14, X_14, X_9 // Adding pSrc[j]
-    umulh X_12, X_6, X_11 // <127:64> of pMod[j]*m
-    adc X_12, X_12, xzr // Add the carry if any (***)
-    adds X_13, X_13, X_14 // Add the lower bits of c
-    adc X_12, X_12, xzr // Add the carry if any (***)
-    str X_13, [X_1, #24] // pSrc[j] = (UINT64) c
+    adc X_20, X_11, xzr // Add the carry if any
 
-    subs X_3, X_3, #1 // Move one digit up
-    bne SymCryptFdefMontgomeryReduceAsmInner
+    // ( X_20, X_10, X_9, X_8, X_12 ) = (( pMod[j+3], pMod[j+2], pMod[j+1], pMod[j+0] ) * m) + c
+    adds X_16, X_16, X_12 // Adding pSrc[i+j+0]
+    adcs X_17, X_17, X_8 // Adding pSrc[i+j+1]
+    adcs X_18, X_18, X_9 // Adding pSrc[i+j+2]
+    adcs X_19, X_19, X_10 // Adding pSrc[i+j+3]
+
+    stp X_16, X_17, [X_1] // pSrc[i+j+0], pSrc[i+j+1]
+    stp X_18, X_19, [X_1, #16] // pSrc[i+j+2], pSrc[i+j+3]
+
+    sub X_3, X_3, #32 // Move one digit up
+    cbnz X_3, SymCryptFdefMontgomeryReduceAsmInner
+
+LABEL(SymCryptFdefMontgomeryReduceAsmInnerEnd)
 
     ldr X_8, [X_1, #32] // pSrc[nWords]
-    adds X_12, X_12, X_8 // c + pSrc[nWords]
-    adc X_13, xzr, xzr // Add the carry if any
+    adcs X_20, X_20, X_7 // c + hc
+    adc X_7, xzr, xzr // Add the carry if any
 
-    adds X_12, X_12, X_7 // c + pSrc[nWords] + hc
-    adc X_7, X_13, xzr // Add the carry if any and store into hc
+    adds X_20, X_20, X_8 // c + hc + pSrc[nWords]
+    adc X_7, X_7, xzr // Add the carry if any and store into hc
 
-    str X_12, [X_1, #32] // pSrc[nWords] = c
+    str X_20, [X_1, #32] // pSrc[nWords] = c
 
     subs X_4, X_4, #1 // Move one word up
 
-    add X_17, X_17, #8 // Move stored pSrc pointer one word up
-    mov X_0, X_16 // Restore pMod pointer
-    mov X_1, X_17 // Restore pSrc pointer
+    sub X_0, X_0, X_21 // Restore pMod pointer (subtract byte count)
+    sub X_1, X_1, X_21 // Restore pSrc pointer (subtract byte count)
+    add X_1, X_1, #8 // Move pSrc pointer one word up
 
-    mov X_3, X_15 // Restore the digit counter
+    mov X_3, X_21 // Restore the byte counter
 
     bne SymCryptFdefMontgomeryReduceAsmOuter
 
     //
     // Subtraction
     //
+    add X_3, X_3, #32 // restore the full byte count for loops using unconditional pre-indexing
+    mov X_21, X_3 // Store the byte count for later
 
+    sub X_0, X_0, #32 // offset pMod so we can use pre-increment form of loads
+    sub X_1, X_1, #32 // offset pSrc so we can use pre-increment form of loads
+    sub X_2, X_2, #32 // offset pDst so we can use pre-increment form of loads
     mov X_14, X_2 // Store pDst pointer
 
     // Prepare the pointers for subtract
-    mov X_0, X_17 // pSrc
-    mov X_1, X_16 // pMod
 
     mov X_10, X_7 // X_10 = hc
-    mov X_3, X_15 // Restore the digit counter
     subs X_4, X_4, X_4 // Set the carry flag (i.e. no borrow)
 
 LABEL(SymCryptFdefMontgomeryReduceRawSubAsmLoop)
-    sub X_3, X_3, #1 // Decrement the digit count by one
+    sub X_3, X_3, #32 // Decrement the byte count by 32
     // borrow is in the carry flag (flipped)
 
-    ldp X_4, X_6, [X_0, #32]! // Load two words of pSrc1
-    ldp X_5, X_7, [X_1, #32]! // Load two words of pSrc2
+    ldp X_4, X_6, [X_1, #32]! // Load two words of pSrc
+    ldp X_5, X_7, [X_0, #32]! // Load two words of pMod
     sbcs X_4, X_4, X_5
     sbcs X_6, X_6, X_7
     stp X_4, X_6, [X_2, #32]! // Store the result in the destination
 
-    ldp X_4, X_6, [X_0, #16] // Load two words of pSrc1
-    ldp X_5, X_7, [X_1, #16] // Load two words of pSrc2
+    ldp X_4, X_6, [X_1, #16] // Load two words of pSrc
+    ldp X_5, X_7, [X_0, #16] // Load two words of pMod
     sbcs X_4, X_4, X_5
     sbcs X_6, X_6, X_7
     stp X_4, X_6, [X_2, #16] // Store the result in the destination
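
The special-cased first iteration of the reduction loop rests on two facts stated in the comments above: the low 64 bits of m*pMod[0] + pSrc[i] are zero by construction of Inv64, so that word is never computed or stored (hence `stp xzr, X_17, [X_1]`), and the carry out of the skipped addition is 1 exactly when pSrc[i] is non-zero, which `subs xzr, X_16, #1` reproduces in the carry flag. A small standalone Python check of both facts (illustrative only, not SymCrypt code):

```python
# Check the first-iteration shortcut: low64(m*pMod[0] + pSrc[i]) == 0, and the
# carry out of that addition is 1 iff pSrc[i] != 0 (what 'subs xzr, X_16, #1' recreates).
import random

M64 = (1 << 64) - 1

for _ in range(10000):
    mod0 = random.getrandbits(64) | 1                    # low word of an odd modulus
    inv64 = (-pow(mod0, -1, 1 << 64)) % (1 << 64)        # Inv64 = -pMod[0]^-1 mod 2^64
    src0 = 0 if random.random() < 0.1 else random.getrandbits(64)
    m = (src0 * inv64) & M64
    t = (m * mod0 & M64) + src0                          # the addition the new code skips
    assert t & M64 == 0                                  # low word is always zero
    assert (t >> 64) == (1 if src0 != 0 else 0)          # carry iff pSrc[i] non-zero
```
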
@@ -1381,14 +1402,14 @@ LABEL(SymCryptFdefMontgomeryReduceRawSubAsmLoop)
     orr X_11, X_10, X_0 // X_11 = hc|d
 
     // Prepare the pointers for masked copy
-    mov X_0, X_17 // pSrc
+    sub X_0, X_1, X_21 // Restore pSrc pointer (subtract byte count)
     mov X_1, X_14 // pDst
 
-    mov X_2, X_15 // Restore the digit counter
+    mov X_2, X_21 // Restore the byte counter
     subs X_4, X_10, X_11 // If (X_11 > X_10) clear the carry flag (i.e. borrow)
 
 LABEL(SymCryptFdefMontgomeryReduceMaskedCopyAsmLoop)
-    sub X_2, X_2, #1 // decrement the digit count by one
+    sub X_2, X_2, #32 // decrement the byte count by 32
 
     ldp X_4, X_6, [X_0, #32]! // Load two words of the source
     ldp X_5, X_7, [X_1, #32]! // Load two words of the destination
@@ -645,26 +645,6 @@ MAPPING_ARM32_AAPCS32 = {
     15:ARM32_R15, # PC
 }
 
-def gen_prologue_aapcs64(self, arg_count, reg_count, stack_alloc_size, xmm_reg_count):
-    prologue = ""
-
-    if reg_count > self.volatile_registers:
-        logging.error("symcryptasm currently does not support spilling registers in leaf functions in aapcs64")
-        exit(1)
-
-    return prologue
-
-def gen_epilogue_aapcs64(self, arg_count, reg_count, stack_alloc_size, xmm_reg_count):
-    epilogue = ""
-
-    if reg_count > self.volatile_registers:
-        logging.error("symcryptasm currently does not support spilling registers in leaf functions in aapcs64")
-        exit(1)
-
-    epilogue += " ret\n"
-
-    return epilogue
-
 def gen_prologue_aapcs32(self, arg_count, reg_count, stack_alloc_size, xmm_reg_count):
     assert(not stack_alloc_size and not xmm_reg_count)
     prologue = ""
@@ -697,7 +677,7 @@ def gen_epilogue_aapcs32(self, arg_count, reg_count, stack_alloc_size, xmm_reg_c
     epilogue += "pop {" + ",".join(registers_to_spill) + "}\n"
     return epilogue
 
-def gen_prologue_arm64ec(self, arg_count, reg_count, stack_alloc_size, xmm_reg_count):
+def gen_prologue_arm64_armasm64(self, arg_count, reg_count, stack_alloc_size, xmm_reg_count):
    prologue = ""
 
    if reg_count > self.volatile_registers:
@@ -717,7 +697,7 @@ def gen_prologue_arm64ec(self, arg_count, reg_count, stack_alloc_size, xmm_reg_c
 
    return prologue
 
-def gen_epilogue_arm64ec(self, arg_count, reg_count, stack_alloc_size, xmm_reg_count):
+def gen_epilogue_arm64_armasm64(self, arg_count, reg_count, stack_alloc_size, xmm_reg_count):
    epilogue = ""
 
    if reg_count > self.volatile_registers:
@@ -741,17 +721,63 @@ def gen_epilogue_arm64ec(self, arg_count, reg_count, stack_alloc_size, xmm_reg_c
 
    return epilogue
 
+def gen_prologue_arm64_gas(self, arg_count, reg_count, stack_alloc_size, xmm_reg_count):
+    prologue = ""
+
+    if reg_count > self.volatile_registers:
+        # Calculate required stack space
+        # If we allocate stack space we must spill fp and lr, so we always spill at least 2 registers
+        registers_to_spill = 2 + reg_count - self.volatile_registers
+        # Stack pointer remain 16B aligned, so round up to the nearest multiple of 16B
+        required_stack_space = 16 * ((registers_to_spill + 1) // 2)
+        prologue += " stp fp, lr, [sp, #-%d]! // allocate %d bytes of stack; store FP/LR\n" % (required_stack_space, required_stack_space)
+
+        stack_offset = 16
+        for i in range(self.volatile_registers, reg_count-1, 2):
+            prologue += " stp X_%d, X_%d, [sp, #%d]\n" % (i, i+1, stack_offset)
+            stack_offset += 16
+        if registers_to_spill % 2 == 1:
+            prologue += " str X_%d, [sp, #%d]\n" % (reg_count-1, stack_offset)
+
+    return prologue
+
+def gen_epilogue_arm64_gas(self, arg_count, reg_count, stack_alloc_size, xmm_reg_count):
+    epilogue = ""
+
+    if reg_count > self.volatile_registers:
+        # Calculate required stack space
+        # If we allocate stack space we must spill fp and lr, so we always spill at least 2 registers
+        registers_to_spill = 2 + reg_count - self.volatile_registers
+        # Stack pointer remain 16B aligned, so round up to the nearest multiple of 16B
+        required_stack_space = 16 * ((registers_to_spill + 1) // 2)
+
+        stack_offset = required_stack_space-16
+        if registers_to_spill % 2 == 1:
+            epilogue += " ldr X_%d, [sp, #%d]\n" % (reg_count-1, stack_offset)
+            stack_offset -= 16
+        for i in reversed(range(self.volatile_registers, reg_count-1, 2)):
+            epilogue += " ldp X_%d, X_%d, [sp, #%d]\n" % (i, i+1, stack_offset)
+            stack_offset -= 16
+        epilogue += " ldp fp, lr, [sp], #%d // deallocate %d bytes of stack; restore FP/LR\n" % (required_stack_space, required_stack_space)
+    epilogue += " ret\n"
+
+    return epilogue
+
 def gen_get_memslot_offset_arm64(self, slot, arg_count, reg_count, stack_alloc_size, xmm_reg_count, nested=False):
     logging.error("symcryptasm currently does not support memory slots for arm64!")
     exit(1)
 
-CALLING_CONVENTION_ARM64_AAPCS64 = CallingConvention(
+CALLING_CONVENTION_ARM64_AAPCS64_ARMASM64 = CallingConvention(
     "arm64_aapcs64", "arm64", MAPPING_ARM64_AAPCS64, 8, 8, 18,
-    gen_prologue_aapcs64, gen_epilogue_aapcs64, gen_get_memslot_offset_arm64)
+    gen_prologue_arm64_armasm64, gen_epilogue_arm64_armasm64, gen_get_memslot_offset_arm64)
 
+CALLING_CONVENTION_ARM64_AAPCS64_GAS = CallingConvention(
+    "arm64_aapcs64", "arm64", MAPPING_ARM64_AAPCS64, 8, 8, 18,
+    gen_prologue_arm64_gas, gen_epilogue_arm64_gas, gen_get_memslot_offset_arm64)
+
 CALLING_CONVENTION_ARM64EC_MSFT = CallingConvention(
     "arm64ec_msft", "arm64", MAPPING_ARM64_ARM64ECMSFT, 8, 8, 16,
-    gen_prologue_arm64ec, gen_epilogue_arm64ec, gen_get_memslot_offset_arm64)
+    gen_prologue_arm64_armasm64, gen_epilogue_arm64_armasm64, gen_get_memslot_offset_arm64)
 
 CALLING_CONVENTION_ARM32_AAPCS32 = CallingConvention(
     "arm32_aapcs32", "arm32", MAPPING_ARM32_AAPCS32, 4, 4, 4,
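
As a worked example of the new aapcs64 prologue/epilogue: this calling convention maps 18 volatile `X_n` registers, and `SymCryptFdefMontgomeryReduceAsm` now declares 22, so fp, lr and four non-volatile registers are spilled into a 48-byte, 16-byte-aligned frame. The standalone sketch below mirrors the size computation from `gen_prologue_arm64_gas`; it is illustrative only and does not import the actual processor script.

```python
def aapcs64_spill_frame(reg_count, volatile_registers=18):
    """Mirror of the stack-frame sizing in gen_prologue_arm64_gas above."""
    if reg_count <= volatile_registers:
        return 0, 0                      # leaf function, nothing spilled
    # fp and lr are always stored when a frame is allocated, hence the +2
    registers_to_spill = 2 + reg_count - volatile_registers
    # sp must stay 16-byte aligned: two 8-byte registers per slot, rounded up
    required_stack_space = 16 * ((registers_to_spill + 1) // 2)
    return registers_to_spill, required_stack_space

# SymCryptFdefMontgomeryReduceAsm: FUNCTION_START(..., 3, 22) with 18 volatile registers
print(aapcs64_spill_frame(22))           # -> (6, 48): fp, lr and X_18..X_21 in 48 bytes
```
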
@@ -1194,7 +1220,7 @@ def process_file(assembler, architecture, calling_convention, infilename, outfil
             mul_calling_convention = CALLING_CONVENTION_AMD64_SYSTEMV_MUL
             nested_calling_convention = CALLING_CONVENTION_AMD64_SYSTEMV_NESTED
         elif architecture == "arm64" and calling_convention == "aapcs64":
-            normal_calling_convention = CALLING_CONVENTION_ARM64_AAPCS64
+            normal_calling_convention = CALLING_CONVENTION_ARM64_AAPCS64_GAS
             mul_calling_convention = None
             nested_calling_convention = None
         elif architecture == "arm" and calling_convention == "aapcs32":
@@ -1203,7 +1229,7 @@ def process_file(assembler, architecture, calling_convention, infilename, outfil
             nested_calling_convention = None
     elif assembler == "armasm64":
         if architecture == "arm64" and calling_convention == "aapcs64":
-            normal_calling_convention = CALLING_CONVENTION_ARM64_AAPCS64
+            normal_calling_convention = CALLING_CONVENTION_ARM64_AAPCS64_ARMASM64
             mul_calling_convention = None
             nested_calling_convention = None
         elif architecture == "arm64" and calling_convention == "arm64ec":
@@ -1 +1 @@
-{ "major": 103, "minor": 4, "patch": 2 }
+{ "major": 103, "minor": 4, "patch": 3 }