diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8b44288..c933ed0 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,7 +3,10 @@
 New changes will be listed here as they are developed. The version number is determined
 prior to the creation of a new release, based on the changes contained in that release.
 
+# Version 103.4.3
+
+- Added preliminary support for macOS (static libraries and unit tests only, no ASM optimizations)
+- Performance improvements for RSA on modern Arm64 microarchitectures
+
 # Version 103.4.2
 
diff --git a/lib/arm64/fdef369_asm.symcryptasm b/lib/arm64/fdef369_asm.symcryptasm
index c369481..f8076fb 100644
--- a/lib/arm64/fdef369_asm.symcryptasm
+++ b/lib/arm64/fdef369_asm.symcryptasm
@@ -331,7 +331,7 @@ FUNCTION_START(ARM64EC_NAME_MANGLE(SymCryptFdef369MontgomeryReduceAsm), 3, 18)
     mov     X_16, X_0               // Store the pMod pointer
     mov     X_17, X_1               // Store the pSrc pointer
 
-    and     X_7, X_7, xzr           // Set hc to 0
+    mov     X_7, xzr                // Set hc to 0
 
     //
     // Main loop
@@ -340,7 +340,7 @@ LABEL(SymCryptFdef369MontgomeryReduceAsmOuter)
     ldr     X_8, [X_1, #24]         // Load 1 word from pSrc
     mul     X_6, X_8, X_5           // <63:0> bits of pSrc[i]*Inv64 = m
 
-    and     X_12, X_12, xzr         // Set c to 0
+    mov     X_12, xzr               // Set c to 0
 
 LABEL(SymCryptFdef369MontgomeryReduceAsmInner)
     ldp     X_10, X_11, [X_0, #24]! // pMod[j]
diff --git a/lib/arm64/fdef_asm.symcryptasm b/lib/arm64/fdef_asm.symcryptasm
index c85251e..ebb8744 100644
--- a/lib/arm64/fdef_asm.symcryptasm
+++ b/lib/arm64/fdef_asm.symcryptasm
@@ -308,13 +308,14 @@ MACRO_START(SQR_DOUBLEADD_64, index, src_reg, dst_reg, mul_word, src_carry, dst_
     ldr     scratch2, [dst_reg, #8*index]   // pDst[2*(i+j)]
 
     mul     scratch1, mul_word, scratch0    // Bits <63:0> of pSrc[i]*pSrc[i+j]
-    adds    scratch1, scratch1, src_carry   // Adding the previous word
     umulh   dst_carry, mul_word, scratch0   // Bits <127:64> of pSrc[i]*pSrc[i+j]
-    adc     dst_carry, dst_carry, xzr       // Add the intermediate carry and don't update the flags
 
     adds    scratch1, scratch1, scratch2    // Add the word from the destination
     adc     dst_carry, dst_carry, xzr       // Add the intermediate carry and don't update the flags
 
+    adds    scratch1, scratch1, src_carry   // Adding the previous word
+    adc     dst_carry, dst_carry, xzr       // Add the intermediate carry and don't update the flags
+
     str     scratch1, [dst_reg, #8*index]   // Store to destination
 MACRO_END()
 
@@ -384,9 +385,9 @@ FUNCTION_START(ARM64EC_NAME_MANGLE(SymCryptFdefRawSquareAsm), 3, 17)
     //
     // First iteration of main loop (no adding of previous values from pDst)
     //
-    ands    X_12, X_12, xzr         // Clearing the carry flag and setting X_12 = 0
+    mov     X_12, xzr               // set X_12 = 0
     ldr     X_6, [X_0]              // load the first word from pSrc1
-    str     X_12, [X_4]             // store 0 for the first word
+    str     xzr, [X_4]              // store 0 for the first word
 
     b       SymCryptFdefRawSquareAsmInnerLoopInit_Word1
 
@@ -423,7 +424,8 @@ LABEL(SymCryptFdefRawSquareAsmOuterLoop)
     mov     X_2, X_0                // set pSrc
     mov     X_4, X_5                // set pDst
 
-    ands    X_12, X_12, xzr         // Clearing the carry flag and setting X_12 = 0
+    mov     X_11, xzr               // set X_11 = 0
+    mov     X_12, xzr               // set X_12 = 0
     ldr     X_6, [X_0, X_8, LSL #3] // load the next word from pSrc
 
     // Cyclic counter and jump logic
@@ -448,16 +450,16 @@ LABEL(SymCryptFdefRawSquareAsmOuterLoop)
     mov     X_3, X_14               // set the new digit counter
 
 LABEL(SymCryptFdefRawSquareAsmInnerLoop_Word0)
-    SQR_DOUBLEADD_64 0, X_2, X_4, X_6, X_12, X_12, X_7, X_9, X_10
+    SQR_DOUBLEADD_64 0, X_2, X_4, X_6, X_12, X_11, X_7, X_9, X_10
 
 LABEL(SymCryptFdefRawSquareAsmInnerLoop_Word1)
-    SQR_DOUBLEADD_64 1, X_2, X_4, X_6, X_12, X_12, X_7, X_9, X_10
+    SQR_DOUBLEADD_64 1, X_2, X_4, X_6, X_11, X_12, X_7, X_9, X_10
 
 LABEL(SymCryptFdefRawSquareAsmInnerLoop_Word2)
-    SQR_DOUBLEADD_64 2, X_2, X_4, X_6, X_12, X_12, X_7, X_9, X_10
+    SQR_DOUBLEADD_64 2, X_2, X_4, X_6, X_12, X_11, X_7, X_9, X_10
 
 LABEL(SymCryptFdefRawSquareAsmInnerLoop_Word3)
-    SQR_DOUBLEADD_64 3, X_2, X_4, X_6, X_12, X_12, X_7, X_9, X_10
+    SQR_DOUBLEADD_64 3, X_2, X_4, X_6, X_11, X_12, X_7, X_9, X_10
 
     sub     X_3, X_3, #1            // move one digit up
     add     X_2, X_2, #32
@@ -470,8 +472,7 @@ LABEL(SymCryptFdefRawSquareAsmInnerLoop_Word3)
     sub     X_1, X_1, #1            // move one word up
     cbnz    X_1, SymCryptFdefRawSquareAsmOuterLoop
 
-    ands    X_12, X_12, xzr         // Setting X_12 = 0
-    str     X_12, [X_5, #40]        // Store 0 to destination for the top word
+    str     xzr, [X_5, #40]         // Store 0 to destination for the top word
 
     ////////////////////////////////////////////////////////////////
     // Second Pass - Shifting all results 1 bit left
@@ -1240,136 +1241,156 @@ FUNCTION_END(ARM64EC_NAME_MANGLE(SymCryptFdefModSquareMontgomeryP384Asm))
 // X_0 = pMod (moving forward one *digit* every inner loop)
 // X_1 = pSrc (moving forward one *digit* every inner loop)
 // X_2 = pDst (used only in the end for subtract / result)
-// X_3 = digit count of pSrc and pMod
+// X_3 = byte count of pSrc and pMod (sometimes byte count - 1 digit of bytes)
 // X_4 = word count of pSrc
 // X_5 = Inv64 of the modulus
 // X_6 = m = pSrc[i]*Inv64
 // X_7 = hc = high carry variable
-// X_8, X_9 = Current words loaded in pairs from pSrc
-// X_10, X_11 = Current words loaded in pairs from pMod
-// X_12, X_13 = c variable = "128-bit" register to hold the result of multiplies
-//              It is flipped between [X_12:X_13] and [X_13:X_12] instead of doing c>>=64
-// X_14 = Temporary intermediate result
-// X_15 = Stored digit count of pSrc
-// X_16 = Stored pMod pointer
-// X_17 = Stored pSrc pointer (moving forward one word every outer loop)
+// X_8-X_19 = temporaries
+// X_20 = c
+// X_21 = Stored byte count of pSrc
 
-FUNCTION_START(ARM64EC_NAME_MANGLE(SymCryptFdefMontgomeryReduceAsm), 3, 18)
+FUNCTION_START(ARM64EC_NAME_MANGLE(SymCryptFdefMontgomeryReduceAsm), 3, 22)
 
-    ldr     W_3, [X_0, #SymCryptModulusNdigitsOffsetArm64]  // # of Digits
+    ldr     W_4, [X_0, #SymCryptModulusNdigitsOffsetArm64]  // # of Digits
     ldr     X_5, [X_0, #SymCryptModulusInv64OffsetArm64]    // Inv64 of modulus
     add     X_0, X_0, #SymCryptModulusValueOffsetArm64      // pMod
 
-    lsl     X_4, X_3, #2            // Multiply by 4 to get the number of words
+    lsl     X_3, X_4, #5            // Multiply by 32 to get the number of bytes
+    lsl     X_4, X_4, #2            // Multiply by 4 to get the number of words
 
-    sub     X_0, X_0, #32           // offset pMod so we can use pre-increment form of loads
-    sub     X_1, X_1, #32           // offset pSrc so we can use pre-increment form of loads
-    sub     X_2, X_2, #32           // offset pDst so we can use pre-increment form of loads
+    sub     X_3, X_3, #32           // offset the byte count to make it easy to restore pointers
 
-    mov     X_15, X_3               // Store the digit count for later
-    mov     X_16, X_0               // Store the pMod pointer
-    mov     X_17, X_1               // Store the pSrc pointer
-
-    and     X_7, X_7, xzr           // Set hc to 0
+    mov     X_7, xzr                // Set hc to 0
+    mov     X_21, X_3               // Store the byte count for later
 
     //
     // Main loop
     //
 LABEL(SymCryptFdefMontgomeryReduceAsmOuter)
-    ldr     X_8, [X_1, #32]         // Load 1 word from pSrc
-    mul     X_6, X_8, X_5           // <63:0> bits of pSrc[i]*Inv64 = m
+    ldp     X_16, X_17, [X_1]       // pSrc[i+0], pSrc[i+1]
+    mul     X_6, X_16, X_5          // <63:0> bits of pSrc[i]*Inv64 = m
 
-    and     X_12, X_12, xzr         // Set c to 0
+    ldp     X_8, X_9, [X_0]         // pMod[0], pMod[1]
+    umulh   X_8, X_6, X_8           // Bits <127:64> of m*pMod[0]
+    subs    xzr, X_16, #1           // Subtract 1 from pSrc[i+0] (generate a carry iff pSrc[i] is non-zero)
+
+    mul     X_13, X_6, X_9          // Bits <63:0> of m*pMod[1]
+    umulh   X_9, X_6, X_9           // Bits <127:64> of m*pMod[1]
+    adcs    X_8, X_8, X_13          // Adding the previous word (if there was a carry from the last addition it is added)
+
+    ldp     X_10, X_11, [X_0, #16]  // pMod[2], pMod[3]
+    mul     X_14, X_6, X_10         // Bits <63:0> of m*pMod[2]
+    umulh   X_10, X_6, X_10         // Bits <127:64> of m*pMod[2]
+    adcs    X_9, X_9, X_14          // Adding the previous word (if there was a carry from the last addition it is added)
+
+    ldp     X_18, X_19, [X_1, #16]  // pSrc[i+2], pSrc[i+3]
+    mul     X_15, X_6, X_11         // Bits <63:0> of m*pMod[3]
+    umulh   X_11, X_6, X_11         // Bits <127:64> of m*pMod[3]
+    adcs    X_10, X_10, X_15        // Adding the previous word (if there was a carry from the last addition it is added)
+
+    adc     X_20, X_11, xzr         // Add the carry if any
+
+    // ( X_20, X_10, X_9, X_8, ___ ) = (( pMod[3], pMod[2], pMod[1], pMod[0] ) * m)
+    // We do not compute this --^ because (m*pMod[0] + pSrc[i]) == 0. We already accounted for the carry above.
+
+    adds    X_17, X_17, X_8         // Adding pSrc[i+1]
+    adcs    X_18, X_18, X_9         // Adding pSrc[i+2]
+    adcs    X_19, X_19, X_10        // Adding pSrc[i+3]
+
+    stp     xzr, X_17, [X_1]        // pSrc[i+0], pSrc[i+1]
+    stp     X_18, X_19, [X_1, #16]  // pSrc[i+2], pSrc[i+3]
+
+    cbz     X_3, SymCryptFdefMontgomeryReduceAsmInnerEnd
 
 LABEL(SymCryptFdefMontgomeryReduceAsmInner)
-    ldp     X_10, X_11, [X_0, #32]! // pMod[j]
-    ldp     X_8, X_9, [X_1, #32]!   // pSrc[j]
+    // Invariant - X_20 (c) + carry flag is a value in the range [0, 2^64-1]
+    // We could always adc X_20, X_20, xzr before loop entry, but this is a wasted instruction
+    // vs. adcs with X_20 below
 
-    mul     X_14, X_6, X_10         // <63:0> of pMod[j]*m
-    adds    X_14, X_14, X_8         // Adding pSrc[j]
-    umulh   X_13, X_6, X_10         // <127:64> of pMod[j]*m
-    adc     X_13, X_13, xzr         // Add the carry if any (***)
-    adds    X_12, X_12, X_14        // Add the lower bits of c
-    adc     X_13, X_13, xzr         // Add the carry if any (***)
-                                    // ***: These cannot produce extra carry as the maximum is
-                                    //      (2^64 - 1)*(2^64-1) + 2^64-1 + 2^64-1 = 2^128 - 1
-    str     X_12, [X_1]             // pSrc[j] = (UINT64) c
+    ldp     X_8, X_9, [X_0, #32]!   // pMod[j+0], pMod[j+1]
+    mul     X_12, X_6, X_8          // Bits <63:0> of m*pMod[j]
+    umulh   X_8, X_6, X_8           // Bits <127:64> of m*pMod[j]
+    adcs    X_12, X_20, X_12        // Adding the previous word (if there was a carry from the last addition it is added)
 
-    mul     X_14, X_6, X_11         // <63:0> of pMod[j]*m
-    adds    X_14, X_14, X_9         // Adding pSrc[j]
-    umulh   X_12, X_6, X_11         // <127:64> of pMod[j]*m
-    adc     X_12, X_12, xzr         // Add the carry if any (***)
-    adds    X_13, X_13, X_14        // Add the lower bits of c
-    adc     X_12, X_12, xzr         // Add the carry if any (***)
-    str     X_13, [X_1, #8]         // pSrc[j] = (UINT64) c
+    mul     X_13, X_6, X_9          // Bits <63:0> of m*pMod[j+1]
+    umulh   X_9, X_6, X_9           // Bits <127:64> of m*pMod[j+1]
+    ldp     X_16, X_17, [X_1, #32]! // pSrc[i+j+0], pSrc[i+j+1]
+    adcs    X_8, X_8, X_13          // Adding the previous word (if there was a carry from the last addition it is added)
 
-    ldp     X_10, X_11, [X_0, #16]  // pMod[j]
-    ldp     X_8, X_9, [X_1, #16]    // pSrc[j]
+    ldp     X_10, X_11, [X_0, #16]  // pMod[j+2], pMod[j+3]
+    mul     X_14, X_6, X_10         // Bits <63:0> of m*pMod[j+2]
+    umulh   X_10, X_6, X_10         // Bits <127:64> of m*pMod[j+2]
+    adcs    X_9, X_9, X_14          // Adding the previous word (if there was a carry from the last addition it is added)
 
-    mul     X_14, X_6, X_10         // <63:0> of pMod[j]*m
-    adds    X_14, X_14, X_8         // Adding pSrc[j]
-    umulh   X_13, X_6, X_10         // <127:64> of pMod[j]*m
-    adc     X_13, X_13, xzr         // Add the carry if any (***)
-    adds    X_12, X_12, X_14        // Add the lower bits of c
-    adc     X_13, X_13, xzr         // Add the carry if any (***)
-    str     X_12, [X_1, #16]        // pSrc[j] = (UINT64) c
+    ldp     X_18, X_19, [X_1, #16]  // pSrc[i+j+2], pSrc[i+j+3]
+    mul     X_15, X_6, X_11         // Bits <63:0> of m*pMod[j+3]
+    umulh   X_11, X_6, X_11         // Bits <127:64> of m*pMod[j+3]
+    adcs    X_10, X_10, X_15        // Adding the previous word (if there was a carry from the last addition it is added)
 
-    mul     X_14, X_6, X_11         // <63:0> of pMod[j]*m
-    adds    X_14, X_14, X_9         // Adding pSrc[j]
-    umulh   X_12, X_6, X_11         // <127:64> of pMod[j]*m
-    adc     X_12, X_12, xzr         // Add the carry if any (***)
-    adds    X_13, X_13, X_14        // Add the lower bits of c
-    adc     X_12, X_12, xzr         // Add the carry if any (***)
-    str     X_13, [X_1, #24]        // pSrc[j] = (UINT64) c
+    adc     X_20, X_11, xzr         // Add the carry if any
 
-    subs    X_3, X_3, #1            // Move one digit up
-    bne     SymCryptFdefMontgomeryReduceAsmInner
+    // ( X_20, X_10, X_9, X_8, X_12 ) = (( pMod[j+3], pMod[j+2], pMod[j+1], pMod[j+0] ) * m) + c
+
+    adds    X_16, X_16, X_12        // Adding pSrc[i+j+0]
+    adcs    X_17, X_17, X_8         // Adding pSrc[i+j+1]
+    adcs    X_18, X_18, X_9         // Adding pSrc[i+j+2]
+    adcs    X_19, X_19, X_10        // Adding pSrc[i+j+3]
+
+    stp     X_16, X_17, [X_1]       // pSrc[i+j+0], pSrc[i+j+1]
+    stp     X_18, X_19, [X_1, #16]  // pSrc[i+j+2], pSrc[i+j+3]
+
+    sub     X_3, X_3, #32           // Move one digit up
+    cbnz    X_3, SymCryptFdefMontgomeryReduceAsmInner
+
+LABEL(SymCryptFdefMontgomeryReduceAsmInnerEnd)
 
     ldr     X_8, [X_1, #32]         // pSrc[nWords]
-    adds    X_12, X_12, X_8         // c + pSrc[nWords]
-    adc     X_13, xzr, xzr          // Add the carry if any
+    adcs    X_20, X_20, X_7         // c + hc
+    adc     X_7, xzr, xzr           // Add the carry if any
 
-    adds    X_12, X_12, X_7         // c + pSrc[nWords] + hc
-    adc     X_7, X_13, xzr          // Add the carry if any and store into hc
+    adds    X_20, X_20, X_8         // c + hc + pSrc[nWords]
+    adc     X_7, X_7, xzr           // Add the carry if any and store into hc
 
-    str     X_12, [X_1, #32]        // pSrc[nWords] = c
+    str     X_20, [X_1, #32]        // pSrc[nWords] = c
 
     subs    X_4, X_4, #1            // Move one word up
-    add     X_17, X_17, #8          // Move stored pSrc pointer one word up
-    mov     X_0, X_16               // Restore pMod pointer
-    mov     X_1, X_17               // Restore pSrc pointer
+    sub     X_0, X_0, X_21          // Restore pMod pointer (subtract byte count)
+    sub     X_1, X_1, X_21          // Restore pSrc pointer (subtract byte count)
+    add     X_1, X_1, #8            // Move pSrc pointer one word up
 
-    mov     X_3, X_15               // Restore the digit counter
+    mov     X_3, X_21               // Restore the byte counter
 
     bne     SymCryptFdefMontgomeryReduceAsmOuter
 
     //
     // Subtraction
     //
 
+    add     X_3, X_3, #32           // restore the full byte count for loops using unconditional pre-indexing
+    mov     X_21, X_3               // Store the byte count for later
+    sub     X_0, X_0, #32           // offset pMod so we can use pre-increment form of loads
+    sub     X_1, X_1, #32           // offset pSrc so we can use pre-increment form of loads
+    sub     X_2, X_2, #32           // offset pDst so we can use pre-increment form of loads
 
     mov     X_14, X_2               // Store pDst pointer
 
     // Prepare the pointers for subtract
-    mov     X_0, X_17               // pSrc
-    mov     X_1, X_16               // pMod
     mov     X_10, X_7               // X_10 = hc
 
-    mov     X_3, X_15               // Restore the digit counter
     subs    X_4, X_4, X_4           // Set the carry flag (i.e. no borrow)
 
 LABEL(SymCryptFdefMontgomeryReduceRawSubAsmLoop)
-    sub     X_3, X_3, #1            // Decrement the digit count by one
+    sub     X_3, X_3, #32           // Decrement the byte count by 32
     // borrow is in the carry flag (flipped)
-    ldp     X_4, X_6, [X_0, #32]!   // Load two words of pSrc1
-    ldp     X_5, X_7, [X_1, #32]!   // Load two words of pSrc2
+    ldp     X_4, X_6, [X_1, #32]!   // Load two words of pSrc
+    ldp     X_5, X_7, [X_0, #32]!   // Load two words of pMod
     sbcs    X_4, X_4, X_5
     sbcs    X_6, X_6, X_7
     stp     X_4, X_6, [X_2, #32]!   // Store the result in the destination
 
-    ldp     X_4, X_6, [X_0, #16]    // Load two words of pSrc1
-    ldp     X_5, X_7, [X_1, #16]    // Load two words of pSrc2
+    ldp     X_4, X_6, [X_1, #16]    // Load two words of pSrc
+    ldp     X_5, X_7, [X_0, #16]    // Load two words of pMod
     sbcs    X_4, X_4, X_5
     sbcs    X_6, X_6, X_7
     stp     X_4, X_6, [X_2, #16]    // Store the result in the destination
 
@@ -1381,14 +1402,14 @@ LABEL(SymCryptFdefMontgomeryReduceRawSubAsmLoop)
     orr     X_11, X_10, X_0         // X_11 = hc|d
 
     // Prepare the pointers for masked copy
-    mov     X_0, X_17               // pSrc
+    sub     X_0, X_1, X_21          // Restore pSrc pointer (subtract byte count)
     mov     X_1, X_14               // pDst
 
-    mov     X_2, X_15               // Restore the digit counter
+    mov     X_2, X_21               // Restore the byte counter
     subs    X_4, X_10, X_11         // If (X_11 > X_10) clear the carry flag (i.e. borrow)
 
 LABEL(SymCryptFdefMontgomeryReduceMaskedCopyAsmLoop)
-    sub     X_2, X_2, #1            // decrement the digit count by one
+    sub     X_2, X_2, #32           // decrement the byte count by 32
 
     ldp     X_4, X_6, [X_0, #32]!   // Load two words of the source
     ldp     X_5, X_7, [X_1, #32]!   // Load two words of the destination
diff --git a/scripts/symcryptasm_processor.py b/scripts/symcryptasm_processor.py
index 0e0dea7..2f730be 100755
--- a/scripts/symcryptasm_processor.py
+++ b/scripts/symcryptasm_processor.py
@@ -645,26 +645,6 @@ MAPPING_ARM32_AAPCS32 = {
     15:ARM32_R15, # PC
 }
 
-def gen_prologue_aapcs64(self, arg_count, reg_count, stack_alloc_size, xmm_reg_count):
-    prologue = ""
-
-    if reg_count > self.volatile_registers:
-        logging.error("symcryptasm currently does not support spilling registers in leaf functions in aapcs64")
-        exit(1)
-
-    return prologue
-
-def gen_epilogue_aapcs64(self, arg_count, reg_count, stack_alloc_size, xmm_reg_count):
-    epilogue = ""
-
-    if reg_count > self.volatile_registers:
-        logging.error("symcryptasm currently does not support spilling registers in leaf functions in aapcs64")
-        exit(1)
-
-    epilogue += "    ret\n"
-
-    return epilogue
-
 def gen_prologue_aapcs32(self, arg_count, reg_count, stack_alloc_size, xmm_reg_count):
     assert(not stack_alloc_size and not xmm_reg_count)
     prologue = ""
@@ -697,7 +677,7 @@ def gen_epilogue_aapcs32(self, arg_count, reg_count, stack_alloc_size, xmm_reg_c
     epilogue += "pop {" + ",".join(registers_to_spill) + "}\n"
     return epilogue
 
-def gen_prologue_arm64ec(self, arg_count, reg_count, stack_alloc_size, xmm_reg_count):
+def gen_prologue_arm64_armasm64(self, arg_count, reg_count, stack_alloc_size, xmm_reg_count):
     prologue = ""
 
     if reg_count > self.volatile_registers:
@@ -717,7 +697,7 @@ def gen_prologue_arm64ec(self, arg_count, reg_count, stack_alloc_size, xmm_reg_c
 
     return prologue
 
-def gen_epilogue_arm64ec(self, arg_count, reg_count, stack_alloc_size, xmm_reg_count):
+def gen_epilogue_arm64_armasm64(self, arg_count, reg_count, stack_alloc_size, xmm_reg_count):
     epilogue = ""
 
     if reg_count > self.volatile_registers:
@@ -741,17 +721,63 @@ def gen_epilogue_arm64ec(self, arg_count, reg_count, stack_alloc_size, xmm_reg_c
 
     return epilogue
 
+def gen_prologue_arm64_gas(self, arg_count, reg_count, stack_alloc_size, xmm_reg_count):
+    prologue = ""
+
+    if reg_count > self.volatile_registers:
+        # Calculate required stack space
+        # If we allocate stack space we must spill fp and lr, so we always spill at least 2 registers
+        registers_to_spill = 2 + reg_count - self.volatile_registers
+        # Stack pointer must remain 16B aligned, so round up to the nearest multiple of 16B
+        required_stack_space = 16 * ((registers_to_spill + 1) // 2)
+        prologue += "    stp fp, lr, [sp, #-%d]! // allocate %d bytes of stack; store FP/LR\n" % (required_stack_space, required_stack_space)
+
+        stack_offset = 16
+        for i in range(self.volatile_registers, reg_count-1, 2):
+            prologue += "    stp X_%d, X_%d, [sp, #%d]\n" % (i, i+1, stack_offset)
+            stack_offset += 16
+        if registers_to_spill % 2 == 1:
+            prologue += "    str X_%d, [sp, #%d]\n" % (reg_count-1, stack_offset)
+
+    return prologue
+
+def gen_epilogue_arm64_gas(self, arg_count, reg_count, stack_alloc_size, xmm_reg_count):
+    epilogue = ""
+
+    if reg_count > self.volatile_registers:
+        # Calculate required stack space
+        # If we allocate stack space we must spill fp and lr, so we always spill at least 2 registers
+        registers_to_spill = 2 + reg_count - self.volatile_registers
+        # Stack pointer must remain 16B aligned, so round up to the nearest multiple of 16B
+        required_stack_space = 16 * ((registers_to_spill + 1) // 2)
+
+        stack_offset = required_stack_space-16
+        if registers_to_spill % 2 == 1:
+            epilogue += "    ldr X_%d, [sp, #%d]\n" % (reg_count-1, stack_offset)
+            stack_offset -= 16
+        for i in reversed(range(self.volatile_registers, reg_count-1, 2)):
+            epilogue += "    ldp X_%d, X_%d, [sp, #%d]\n" % (i, i+1, stack_offset)
+            stack_offset -= 16
+        epilogue += "    ldp fp, lr, [sp], #%d // deallocate %d bytes of stack; restore FP/LR\n" % (required_stack_space, required_stack_space)
+    epilogue += "    ret\n"
+
+    return epilogue
+
 def gen_get_memslot_offset_arm64(self, slot, arg_count, reg_count, stack_alloc_size, xmm_reg_count, nested=False):
     logging.error("symcryptasm currently does not support memory slots for arm64!")
     exit(1)
 
-CALLING_CONVENTION_ARM64_AAPCS64 = CallingConvention(
+CALLING_CONVENTION_ARM64_AAPCS64_ARMASM64 = CallingConvention(
     "arm64_aapcs64", "arm64", MAPPING_ARM64_AAPCS64, 8, 8, 18,
-    gen_prologue_aapcs64, gen_epilogue_aapcs64, gen_get_memslot_offset_arm64)
+    gen_prologue_arm64_armasm64, gen_epilogue_arm64_armasm64, gen_get_memslot_offset_arm64)
+
+CALLING_CONVENTION_ARM64_AAPCS64_GAS = CallingConvention(
+    "arm64_aapcs64", "arm64", MAPPING_ARM64_AAPCS64, 8, 8, 18,
+    gen_prologue_arm64_gas, gen_epilogue_arm64_gas, gen_get_memslot_offset_arm64)
 
 CALLING_CONVENTION_ARM64EC_MSFT = CallingConvention(
     "arm64ec_msft", "arm64", MAPPING_ARM64_ARM64ECMSFT, 8, 8, 16,
-    gen_prologue_arm64ec, gen_epilogue_arm64ec, gen_get_memslot_offset_arm64)
+    gen_prologue_arm64_armasm64, gen_epilogue_arm64_armasm64, gen_get_memslot_offset_arm64)
 
 CALLING_CONVENTION_ARM32_AAPCS32 = CallingConvention(
     "arm32_aapcs32", "arm32", MAPPING_ARM32_AAPCS32, 4, 4, 4,
@@ -1194,7 +1220,7 @@ def process_file(assembler, architecture, calling_convention, infilename, outfil
             mul_calling_convention = CALLING_CONVENTION_AMD64_SYSTEMV_MUL
             nested_calling_convention = CALLING_CONVENTION_AMD64_SYSTEMV_NESTED
         elif architecture == "arm64" and calling_convention == "aapcs64":
-            normal_calling_convention = CALLING_CONVENTION_ARM64_AAPCS64
+            normal_calling_convention = CALLING_CONVENTION_ARM64_AAPCS64_GAS
             mul_calling_convention = None
             nested_calling_convention = None
         elif architecture == "arm" and calling_convention == "aapcs32":
@@ -1203,7 +1229,7 @@ def process_file(assembler, architecture, calling_convention, infilename, outfil
             nested_calling_convention = None
     elif assembler == "armasm64":
         if architecture == "arm64" and calling_convention == "aapcs64":
-            normal_calling_convention = CALLING_CONVENTION_ARM64_AAPCS64
+            normal_calling_convention = CALLING_CONVENTION_ARM64_AAPCS64_ARMASM64
             mul_calling_convention = None
             nested_calling_convention = None
         elif architecture == "arm64" and calling_convention == "arm64ec":
diff --git a/version.json b/version.json
index 974d085..05e5c9e 100644
--- a/version.json
+++ b/version.json
@@ -1 +1 @@
-{ "major": 103, "minor": 4, "patch": 2 }
\ No newline at end of file
+{ "major": 103, "minor": 4, "patch": 3 }
\ No newline at end of file
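
Notes on the changes above, with small illustrative Python sketches. Any helper name in these sketches that does not appear in the patch is hypothetical.

1. The SQR_DOUBLEADD_64 reordering only moves the point where the incoming carry word (src_carry) is folded in; per invocation the macro still computes a new destination word and a 64-bit carry-out satisfying dst_new + 2^64*carry_out == mul_word*src[k] + dst_old + carry_in. The split two-adc carry chain cannot overflow, since the maximum is (2^64-1)^2 + 2*(2^64-1) = 2^128 - 1, the same bound the removed "***" comment stated. A word-level model:

def sqr_doubleadd_word(dst_old, src_word, mul_word, carry_in):
    # 64x64->128 multiply plus two 64-bit addends; the total fits in 128 bits
    t = mul_word * src_word + dst_old + carry_in
    return t & ((1 << 64) - 1), t >> 64   # (new destination word, carry out)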
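2. For checking the rewritten SymCryptFdefMontgomeryReduceAsm against a reference, here is a word-level Python model of Montgomery reduction (a sketch, not SymCrypt code; the asm processes four words per inner iteration, this does one at a time). Inv64 is the precomputed -pMod[0]^-1 mod 2^64, which is why the patch's comment can assert (m*pMod[0] + pSrc[i]) == 0:

M64 = (1 << 64) - 1

def montgomery_reduce(src, mod, inv64):
    # src: 2n little-endian 64-bit words; mod: n words; inv64 = -mod[0]^-1 mod 2^64
    n = len(mod)
    hc = 0                                  # high carry ("hc", X_7 in the asm)
    for i in range(n):                      # one outer pass per word of pSrc
        m = (src[i] * inv64) & M64          # m = pSrc[i]*Inv64 mod 2^64
        c = 0
        for j in range(n):                  # src[i..i+n-1] += m * mod
            t = src[i + j] + m * mod[j] + c
            src[i + j] = t & M64            # src[i] becomes 0 by choice of m
            c = t >> 64
        t = src[i + n] + c + hc             # fold c and hc into pSrc[nWords]
        src[i + n] = t & M64
        hc = t >> 64
    # candidate result hc:src[n..2n-1] lies in [0, 2*mod); the asm's subtract
    # and masked-copy passes perform this final conditional subtraction
    val = sum(src[n + k] << (64 * k) for k in range(n)) + (hc << (64 * n))
    modval = sum(mod[k] << (64 * k) for k in range(n))
    return val - modval if val >= modval else val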
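3. The `subs xzr, X_16, #1` in the new outer loop stands in for the skipped low-word addition m*pMod[0] + pSrc[i]: that sum is 0 mod 2^64 by construction, and it equals exactly 2^64 (carry-out of 1) iff pSrc[i] is non-zero, which is precisely the carry flag a flag-setting compare of pSrc[i] against 1 produces. A quick self-check (mod0 below is an arbitrary odd word chosen for illustration, not a value from the patch):

import random

M64 = 1 << 64
mod0 = 0xFFFFFFFF00000001                   # hypothetical odd modulus word
inv64 = (-pow(mod0, -1, M64)) % M64         # Inv64 = -mod0^-1 mod 2^64

for src_i in (0, 1, M64 - 1, random.randrange(M64)):
    m = (src_i * inv64) % M64
    low = (m * mod0) % M64                  # low word of m*pMod[0]
    assert (low + src_i) % M64 == 0         # the sum we never compute is 0 mod 2^64
    assert (low + src_i >= M64) == (src_i != 0)   # carry out == (pSrc[i] != 0)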
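4. The new gen_prologue_arm64_gas/gen_epilogue_arm64_gas spill logic always stores fp/lr when a frame is allocated and keeps sp 16-byte aligned, as AAPCS64 requires. The frame-size rule, extracted into a standalone sketch with a hypothetical helper name:

def required_stack_space(reg_count, volatile_registers):
    # fp and lr are always spilled alongside the non-volatile registers
    registers_to_spill = 2 + reg_count - volatile_registers
    # each stp holds two 8-byte registers; round up to a whole 16-byte slot
    # so the stack pointer stays 16B aligned
    return 16 * ((registers_to_spill + 1) // 2)

# e.g. SymCryptFdefMontgomeryReduceAsm now declares 22 registers while
# aapcs64 has 18 volatile ones: 2 + 22 - 18 = 6 spills -> 48-byte frame
assert required_stack_space(22, 18) == 48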