Merged PR 11150425: Arm64 server perf work

## Description:

+ Improve `SymCryptFdefMontgomeryReduceAsm`
  + Reduce the instruction count in the inner loop - remove a superfluous `adc` with zero
  + Special-case the first iteration of the reduction loop to further reduce instruction count and multiplication uops (a sketch of the underlying invariant follows below)
  + To make the new code easier to express, use non-volatile registers in aapcs64 assembly for the first time; this required a small extension to the symcryptasm processor script
+ Improve `SymCryptFdefRawSquareAsm` by reordering the inner-loop macro and alternating carry registers to reduce undue dependencies

+ There is more room for improvement in a follow-on PR, but checking in what we have now to land these gains before the GE deadline.
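Below is a minimal Python model of one outer step of word-level Montgomery reduction (an illustrative sketch; `montgomery_outer_step` and its variable names are hypothetical, not SymCrypt's API). It shows the invariant the special-cased first iteration exploits: `m` is chosen so that the low word of `pSrc[i] + m*pMod[0]` is always zero, so that word needs neither a full multiply-accumulate nor a store, and the carry out of it is 1 exactly when `pSrc[i]` is non-zero (which the new code recovers with a single `subs xzr, X_16, #1`).

```python
# Minimal model of one outer step of word-level Montgomery reduction.
# Hypothetical sketch - names are illustrative, not SymCrypt's API.
W = 1 << 64

def montgomery_outer_step(src, mod, inv64, i):
    # m is chosen so that the low 64 bits of src[i] + m*mod[0] are zero
    m = (src[i] * inv64) % W
    carry = 0
    for j in range(len(mod)):
        t = src[i + j] + m * mod[j] + carry
        src[i + j] = t % W      # always 0 for j == 0, so the asm skips it
        carry = t // W          # for j == 0 this is umulh(m, mod[0]) + (src[i] != 0)
    return carry                # folded into src[i + len(mod)] by the caller

# Tiny check with a 2-word modulus; mod must be odd, inv64 = -mod[0]^-1 mod 2^64
mod = [0xFFFFFFFFFFFFFFC5, 0x2]
inv64 = (-pow(mod[0], -1, W)) % W
src = [123456789, 987654321, 0, 0]
carry = montgomery_outer_step(src, mod, inv64, 0)
assert src[0] == 0              # the invariant the first iteration exploits
```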

## Admin Checklist:
- [X] You have updated documentation in symcrypt.h to reflect any changes in behavior
- [X] You have updated CHANGELOG.md to reflect any changes in behavior
- [X] You have updated symcryptunittest to exercise any new functionality
- [X] If you have introduced any symbols in symcrypt.h you have updated production and test dynamic export symbols (exports.ver / exports.def / symcrypt.src) and tested the updated dynamic modules with symcryptunittest
- [X] If you have introduced functionality that varies based on CPU features, you have manually tested with and without relevant features
- [X] If you have made significant changes to a particular algorithm, you have checked that performance numbers reported by symcryptunittest are in line with expectations
- [X] If you have added new algorithms/modes, you have updated the status indicator text for the associated modules if necessary
This commit is contained in:
Samuel Lee 2024-07-26 02:18:13 +00:00
Parent 5e521761ef
Commit 982858166c
5 changed files: 171 additions and 121 deletions

View file

```diff
@@ -3,7 +3,10 @@
 New changes will be listed here as they are developed. The version number is determined
 prior to the creation of a new release, based on the changes contained in that release.
 
+# Version 103.4.3
+
 - Added preliminary support for macOS (static libraries and unit tests only, no ASM optimizations)
+- Performance improvements for RSA for modern Arm64 microarchitecture
 
 # Version 103.4.2
```

View file

```diff
@@ -331,7 +331,7 @@ FUNCTION_START(ARM64EC_NAME_MANGLE(SymCryptFdef369MontgomeryReduceAsm), 3, 18)
     mov   X_16, X_0                 // Store the pMod pointer
     mov   X_17, X_1                 // Store the pSrc pointer
-    and   X_7, X_7, xzr             // Set hc to 0
+    mov   X_7, xzr                  // Set hc to 0
 //
 // Main loop
@@ -340,7 +340,7 @@ LABEL(SymCryptFdef369MontgomeryReduceAsmOuter)
     ldr   X_8, [X_1, #24]           // Load 1 word from pSrc
     mul   X_6, X_8, X_5             // <63:0> bits of pSrc[i]*Inv64 = m
-    and   X_12, X_12, xzr           // Set c to 0
+    mov   X_12, xzr                 // Set c to 0
 LABEL(SymCryptFdef369MontgomeryReduceAsmInner)
     ldp   X_10, X_11, [X_0, #24]!   // pMod[j]
```

View file

```diff
@@ -308,13 +308,14 @@ MACRO_START(SQR_DOUBLEADD_64, index, src_reg, dst_reg, mul_word, src_carry, dst_
     ldr   scratch2, [dst_reg, #8*index]    // pDst[2*(i+j)]
     mul   scratch1, mul_word, scratch0     // Bits <63:0> of pSrc[i]*pSrc[i+j]
+    adds  scratch1, scratch1, src_carry    // Adding the previous word
     umulh dst_carry, mul_word, scratch0    // Bits <127:64> of pSrc[i]*pSrc[i+j]
+    adc   dst_carry, dst_carry, xzr        // Add the intermediate carry and don't update the flags
     adds  scratch1, scratch1, scratch2     // Add the word from the destination
     adc   dst_carry, dst_carry, xzr        // Add the intermediate carry and don't update the flags
-    adds  scratch1, scratch1, src_carry    // Adding the previous word
-    adc   dst_carry, dst_carry, xzr        // Add the intermediate carry and don't update the flags
     str   scratch1, [dst_reg, #8*index]    // Store to destination
 MACRO_END()
```
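The reordering keeps the arithmetic identical; what changes is when the incoming carry is consumed. A quick sanity model (hypothetical Python, not the library's code) shows why the intermediate `adc ..., xzr` steps can never lose a carry:

```python
# Model of one SQR_DOUBLEADD_64 step: 64x64 multiply plus two 64-bit addends.
# Hypothetical sketch - not SymCrypt's code.
W = 1 << 64
MASK = W - 1

def sqr_doubleadd_64(mul_word, src_word, dst_word, src_carry):
    # Maximum value is (W-1)**2 + (W-1) + (W-1) = W**2 - 1, so the result
    # always fits in 128 bits and no carry can be dropped by the adc steps.
    t = mul_word * src_word + src_carry + dst_word
    return t & MASK, t >> 64        # (word stored to pDst, next carry)

# Worst case: all inputs at their maximum still fit exactly in 128 bits
lo, hi = sqr_doubleadd_64(MASK, MASK, MASK, MASK)
assert (hi << 64) | lo == W * W - 1
```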
```diff
@@ -384,9 +385,9 @@ FUNCTION_START(ARM64EC_NAME_MANGLE(SymCryptFdefRawSquareAsm), 3, 17)
     //
     // First iteration of main loop (no adding of previous values from pDst)
     //
-    ands  X_12, X_12, xzr           // Clearing the carry flag and setting X_12 = 0
+    mov   X_12, xzr                 // set X_12 = 0
     ldr   X_6, [X_0]                // load the first word from pSrc1
-    str   X_12, [X_4]               // store 0 for the first word
+    str   xzr, [X_4]                // store 0 for the first word
     b     SymCryptFdefRawSquareAsmInnerLoopInit_Word1
```
```diff
@@ -423,7 +424,8 @@ LABEL(SymCryptFdefRawSquareAsmOuterLoop)
     mov   X_2, X_0                  // set pSrc
     mov   X_4, X_5                  // set pDst
-    ands  X_12, X_12, xzr           // Clearing the carry flag and setting X_12 = 0
+    mov   X_11, xzr                 // set X_11 = 0
+    mov   X_12, xzr                 // set X_12 = 0
     ldr   X_6, [X_0, X_8, LSL #3]   // load the next word from pSrc
     // Cyclic counter and jump logic
```
```diff
@@ -448,16 +450,16 @@ LABEL(SymCryptFdefRawSquareAsmOuterLoop)
     mov   X_3, X_14                 // set the new digit counter
 
 LABEL(SymCryptFdefRawSquareAsmInnerLoop_Word0)
-    SQR_DOUBLEADD_64 0, X_2, X_4, X_6, X_12, X_12, X_7, X_9, X_10
+    SQR_DOUBLEADD_64 0, X_2, X_4, X_6, X_12, X_11, X_7, X_9, X_10
 
 LABEL(SymCryptFdefRawSquareAsmInnerLoop_Word1)
-    SQR_DOUBLEADD_64 1, X_2, X_4, X_6, X_12, X_12, X_7, X_9, X_10
+    SQR_DOUBLEADD_64 1, X_2, X_4, X_6, X_11, X_12, X_7, X_9, X_10
 
 LABEL(SymCryptFdefRawSquareAsmInnerLoop_Word2)
-    SQR_DOUBLEADD_64 2, X_2, X_4, X_6, X_12, X_12, X_7, X_9, X_10
+    SQR_DOUBLEADD_64 2, X_2, X_4, X_6, X_12, X_11, X_7, X_9, X_10
 
 LABEL(SymCryptFdefRawSquareAsmInnerLoop_Word3)
-    SQR_DOUBLEADD_64 3, X_2, X_4, X_6, X_12, X_12, X_7, X_9, X_10
+    SQR_DOUBLEADD_64 3, X_2, X_4, X_6, X_11, X_12, X_7, X_9, X_10
 
     sub   X_3, X_3, #1              // move one digit up
     add   X_2, X_2, #32
```
```diff
@@ -470,8 +472,7 @@ LABEL(SymCryptFdefRawSquareAsmInnerLoop_Word3)
     sub   X_1, X_1, #1              // move one word up
     cbnz  X_1, SymCryptFdefRawSquareAsmOuterLoop
-    ands  X_12, X_12, xzr           // Setting X_12 = 0
-    str   X_12, [X_5, #40]          // Store 0 to destination for the top word
+    str   xzr, [X_5, #40]           // Store 0 to destination for the top word
 
 ////////////////////////////////////////////////////////////////
 // Second Pass - Shifting all results 1 bit left
```
```diff
@@ -1240,136 +1241,156 @@ FUNCTION_END(ARM64EC_NAME_MANGLE(SymCryptFdefModSquareMontgomeryP384Asm))
 // X_0 = pMod (moving forward one *digit* every inner loop)
 // X_1 = pSrc (moving forward one *digit* every inner loop)
 // X_2 = pDst (used only in the end for subtract / result)
-// X_3 = digit count of pSrc and pMod
+// X_3 = byte count of pSrc and pMod (sometimes byte count - 1 digit of bytes)
 // X_4 = word count of pSrc
 // X_5 = Inv64 of the modulus
 // X_6 = m = pSrc[i]*Inv64
 // X_7 = hc = high carry variable
-// X_8, X_9 = Current words loaded in pairs from pSrc
-// X_10, X_11 = Current words loaded in pairs from pMod
-// X_12, X_13 = c variable = "128-bit" register to hold the result of multiplies
-//              It is flipped between [X_12:X_13] and [X_13:X_12] instead of doing c>>=64
-// X_14 = Temporary intermediate result
-// X_15 = Stored digit count of pSrc
-// X_16 = Stored pMod pointer
-// X_17 = Stored pSrc pointer (moving forward one word every outer loop)
-FUNCTION_START(ARM64EC_NAME_MANGLE(SymCryptFdefMontgomeryReduceAsm), 3, 18)
-    ldr   W_3, [X_0, #SymCryptModulusNdigitsOffsetArm64]   // # of Digits
+// X_8-X_19 = temporaries
+// X_20 = c
+// X_21 = Stored byte count of pSrc
+FUNCTION_START(ARM64EC_NAME_MANGLE(SymCryptFdefMontgomeryReduceAsm), 3, 22)
+    ldr   W_4, [X_0, #SymCryptModulusNdigitsOffsetArm64]   // # of Digits
     ldr   X_5, [X_0, #SymCryptModulusInv64OffsetArm64]     // Inv64 of modulus
     add   X_0, X_0, #SymCryptModulusValueOffsetArm64       // pMod
 
-    lsl   X_4, X_3, #2              // Multiply by 4 to get the number of words
+    lsl   X_3, X_4, #5              // Multiply by 32 to get the number of bytes
+    lsl   X_4, X_4, #2              // Multiply by 4 to get the number of words
 
-    sub   X_0, X_0, #32             // offset pMod so we can use pre-increment form of loads
-    sub   X_1, X_1, #32             // offset pSrc so we can use pre-increment form of loads
-    sub   X_2, X_2, #32             // offset pDst so we can use pre-increment form of loads
-    mov   X_15, X_3                 // Store the digit count for later
-    mov   X_16, X_0                 // Store the pMod pointer
-    mov   X_17, X_1                 // Store the pSrc pointer
-    and   X_7, X_7, xzr             // Set hc to 0
+    sub   X_3, X_3, #32             // offset the byte count to make it easy to restore pointers
+    mov   X_7, xzr                  // Set hc to 0
+    mov   X_21, X_3                 // Store the byte count for later
 
 //
 // Main loop
 //
 LABEL(SymCryptFdefMontgomeryReduceAsmOuter)
-    ldr   X_8, [X_1, #32]           // Load 1 word from pSrc
-    mul   X_6, X_8, X_5             // <63:0> bits of pSrc[i]*Inv64 = m
-    and   X_12, X_12, xzr           // Set c to 0
+    ldp   X_16, X_17, [X_1]         // pSrc[i+0], pSrc[i+1]
+    mul   X_6, X_16, X_5            // <63:0> bits of pSrc[i]*Inv64 = m
+
+    ldp   X_8, X_9, [X_0]           // pMod[0], pMod[1]
+    umulh X_8, X_6, X_8             // Bits <127:64> of m*pMod[0]
+    subs  xzr, X_16, #1             // Subtract 1 from pSrc[i+0] (generate a carry iff pSrc[i] is non-zero)
+
+    mul   X_13, X_6, X_9            // Bits <63:0> of m*pMod[1]
+    umulh X_9, X_6, X_9             // Bits <127:64> of m*pMod[1]
+    adcs  X_8, X_8, X_13            // Adding the previous word (if there was a carry from the last addition it is added)
+
+    ldp   X_10, X_11, [X_0, #16]    // pMod[2], pMod[3]
+    mul   X_14, X_6, X_10           // Bits <63:0> of m*pMod[2]
+    umulh X_10, X_6, X_10           // Bits <127:64> of m*pMod[2]
+    adcs  X_9, X_9, X_14            // Adding the previous word (if there was a carry from the last addition it is added)
+
+    ldp   X_18, X_19, [X_1, #16]    // pSrc[i+2], pSrc[i+3]
+    mul   X_15, X_6, X_11           // Bits <63:0> of m*pMod[3]
+    umulh X_11, X_6, X_11           // Bits <127:64> of m*pMod[3]
+    adcs  X_10, X_10, X_15          // Adding the previous word (if there was a carry from the last addition it is added)
+
+    adc   X_20, X_11, xzr           // Add the carry if any
+
+    // ( X_20, X_10, X_9, X_8, ___ ) = (( pMod[3], pMod[2], pMod[1], pMod[0] ) * m)
+    // We do not compute this --^ because (m*pMod[0] + pSrc[i]) == 0. We already accounted for the carry above.
+
+    adds  X_17, X_17, X_8           // Adding pSrc[i+1]
+    adcs  X_18, X_18, X_9           // Adding pSrc[i+2]
+    adcs  X_19, X_19, X_10          // Adding pSrc[i+3]
+
+    stp   xzr, X_17, [X_1]          // pSrc[i+0], pSrc[i+1]
+    stp   X_18, X_19, [X_1, #16]    // pSrc[i+2], pSrc[i+3]
+
+    cbz   X_3, SymCryptFdefMontgomeryReduceAsmInnerEnd
 
 LABEL(SymCryptFdefMontgomeryReduceAsmInner)
-    ldp   X_10, X_11, [X_0, #32]!   // pMod[j]
-    ldp   X_8, X_9, [X_1, #32]!     // pSrc[j]
-    mul   X_14, X_6, X_10           // <63:0> of pMod[j]*m
-    adds  X_14, X_14, X_8           // Adding pSrc[j]
-    umulh X_13, X_6, X_10           // <127:64> of pMod[j]*m
-    adc   X_13, X_13, xzr           // Add the carry if any (***)
-    adds  X_12, X_12, X_14          // Add the lower bits of c
-    adc   X_13, X_13, xzr           // Add the carry if any (***)
-    // ***: These cannot produce extra carry as the maximum is
-    // (2^64 - 1)*(2^64-1) + 2^64-1 + 2^64-1 = 2^128 - 1
-    str   X_12, [X_1]               // pSrc[j] = (UINT64) c
-    mul   X_14, X_6, X_11           // <63:0> of pMod[j]*m
-    adds  X_14, X_14, X_9           // Adding pSrc[j]
-    umulh X_12, X_6, X_11           // <127:64> of pMod[j]*m
-    adc   X_12, X_12, xzr           // Add the carry if any (***)
-    adds  X_13, X_13, X_14          // Add the lower bits of c
-    adc   X_12, X_12, xzr           // Add the carry if any (***)
-    str   X_13, [X_1, #8]           // pSrc[j] = (UINT64) c
-    ldp   X_10, X_11, [X_0, #16]    // pMod[j]
-    ldp   X_8, X_9, [X_1, #16]      // pSrc[j]
-    mul   X_14, X_6, X_10           // <63:0> of pMod[j]*m
-    adds  X_14, X_14, X_8           // Adding pSrc[j]
-    umulh X_13, X_6, X_10           // <127:64> of pMod[j]*m
-    adc   X_13, X_13, xzr           // Add the carry if any (***)
-    adds  X_12, X_12, X_14          // Add the lower bits of c
-    adc   X_13, X_13, xzr           // Add the carry if any (***)
-    str   X_12, [X_1, #16]          // pSrc[j] = (UINT64) c
-    mul   X_14, X_6, X_11           // <63:0> of pMod[j]*m
-    adds  X_14, X_14, X_9           // Adding pSrc[j]
-    umulh X_12, X_6, X_11           // <127:64> of pMod[j]*m
-    adc   X_12, X_12, xzr           // Add the carry if any (***)
-    adds  X_13, X_13, X_14          // Add the lower bits of c
-    adc   X_12, X_12, xzr           // Add the carry if any (***)
-    str   X_13, [X_1, #24]          // pSrc[j] = (UINT64) c
-    subs  X_3, X_3, #1              // Move one digit up
-    bne   SymCryptFdefMontgomeryReduceAsmInner
+    // Invariant - X_20 (c) + carry flag is a value in the range [0, 2^64-1]
+    // We could always adc X_20, X_20, xzr before loop entry, but this is a wasted instruction
+    // vs. adcs with X_20 below
+    ldp   X_8, X_9, [X_0, #32]!     // pMod[j+0], pMod[j+1]
+    mul   X_12, X_6, X_8            // Bits <63:0> of m*pMod[j]
+    umulh X_8, X_6, X_8             // Bits <127:64> of m*pMod[j]
+    adcs  X_12, X_20, X_12          // Adding the previous word (if there was a carry from the last addition it is added)
+
+    mul   X_13, X_6, X_9            // Bits <63:0> of m*pMod[j+1]
+    umulh X_9, X_6, X_9             // Bits <127:64> of m*pMod[j+1]
+    ldp   X_16, X_17, [X_1, #32]!   // pSrc[i+j+0], pSrc[i+j+1]
+    adcs  X_8, X_8, X_13            // Adding the previous word (if there was a carry from the last addition it is added)
+
+    ldp   X_10, X_11, [X_0, #16]    // pMod[j+2], pMod[j+3]
+    mul   X_14, X_6, X_10           // Bits <63:0> of m*pMod[j+2]
+    umulh X_10, X_6, X_10           // Bits <127:64> of m*pMod[j+2]
+    adcs  X_9, X_9, X_14            // Adding the previous word (if there was a carry from the last addition it is added)
+
+    ldp   X_18, X_19, [X_1, #16]    // pSrc[i+j+2], pSrc[i+j+3]
+    mul   X_15, X_6, X_11           // Bits <63:0> of m*pMod[j+3]
+    umulh X_11, X_6, X_11           // Bits <127:64> of m*pMod[j+3]
+    adcs  X_10, X_10, X_15          // Adding the previous word (if there was a carry from the last addition it is added)
+
+    adc   X_20, X_11, xzr           // Add the carry if any
+
+    // ( X_20, X_10, X_9, X_8, X_12 ) = (( pMod[j+3], pMod[j+2], pMod[j+1], pMod[j+0] ) * m) + c
+
+    adds  X_16, X_16, X_12          // Adding pSrc[i+j+0]
+    adcs  X_17, X_17, X_8           // Adding pSrc[i+j+1]
+    adcs  X_18, X_18, X_9           // Adding pSrc[i+j+2]
+    adcs  X_19, X_19, X_10          // Adding pSrc[i+j+3]
+
+    stp   X_16, X_17, [X_1]         // pSrc[i+j+0], pSrc[i+j+1]
+    stp   X_18, X_19, [X_1, #16]    // pSrc[i+j+2], pSrc[i+j+3]
+
+    sub   X_3, X_3, #32             // Move one digit up
+    cbnz  X_3, SymCryptFdefMontgomeryReduceAsmInner
+
+LABEL(SymCryptFdefMontgomeryReduceAsmInnerEnd)
     ldr   X_8, [X_1, #32]           // pSrc[nWords]
-    adds  X_12, X_12, X_8           // c + pSrc[nWords]
-    adc   X_13, xzr, xzr            // Add the carry if any
-    adds  X_12, X_12, X_7           // c + pSrc[nWords] + hc
-    adc   X_7, X_13, xzr            // Add the carry if any and store into hc
-    str   X_12, [X_1, #32]          // pSrc[nWords] = c
+    adcs  X_20, X_20, X_7           // c + hc
+    adc   X_7, xzr, xzr             // Add the carry if any
+    adds  X_20, X_20, X_8           // c + hc + pSrc[nWords]
+    adc   X_7, X_7, xzr             // Add the carry if any and store into hc
+    str   X_20, [X_1, #32]          // pSrc[nWords] = c
 
     subs  X_4, X_4, #1              // Move one word up
-    add   X_17, X_17, #8            // Move stored pSrc pointer one word up
-    mov   X_0, X_16                 // Restore pMod pointer
-    mov   X_1, X_17                 // Restore pSrc pointer
-    mov   X_3, X_15                 // Restore the digit counter
+    sub   X_0, X_0, X_21            // Restore pMod pointer (subtract byte count)
+    sub   X_1, X_1, X_21            // Restore pSrc pointer (subtract byte count)
+    add   X_1, X_1, #8              // Move pSrc pointer one word up
+    mov   X_3, X_21                 // Restore the byte counter
     bne   SymCryptFdefMontgomeryReduceAsmOuter
 
 //
 // Subtraction
 //
+    add   X_3, X_3, #32             // restore the full byte count for loops using unconditional pre-indexing
+    mov   X_21, X_3                 // Store the byte count for later
+    sub   X_0, X_0, #32             // offset pMod so we can use pre-increment form of loads
+    sub   X_1, X_1, #32             // offset pSrc so we can use pre-increment form of loads
+    sub   X_2, X_2, #32             // offset pDst so we can use pre-increment form of loads
+
     mov   X_14, X_2                 // Store pDst pointer
 
     // Prepare the pointers for subtract
-    mov   X_0, X_17                 // pSrc
-    mov   X_1, X_16                 // pMod
     mov   X_10, X_7                 // X_10 = hc
-    mov   X_3, X_15                 // Restore the digit counter
     subs  X_4, X_4, X_4             // Set the carry flag (i.e. no borrow)
 
 LABEL(SymCryptFdefMontgomeryReduceRawSubAsmLoop)
-    sub   X_3, X_3, #1              // Decrement the digit count by one
+    sub   X_3, X_3, #32             // Decrement the byte count by 32
     // borrow is in the carry flag (flipped)
-    ldp   X_4, X_6, [X_0, #32]!     // Load two words of pSrc1
-    ldp   X_5, X_7, [X_1, #32]!     // Load two words of pSrc2
+    ldp   X_4, X_6, [X_1, #32]!     // Load two words of pSrc
+    ldp   X_5, X_7, [X_0, #32]!     // Load two words of pMod
     sbcs  X_4, X_4, X_5
     sbcs  X_6, X_6, X_7
     stp   X_4, X_6, [X_2, #32]!     // Store the result in the destination
 
-    ldp   X_4, X_6, [X_0, #16]      // Load two words of pSrc1
-    ldp   X_5, X_7, [X_1, #16]      // Load two words of pSrc2
+    ldp   X_4, X_6, [X_1, #16]      // Load two words of pSrc
+    ldp   X_5, X_7, [X_0, #16]      // Load two words of pMod
     sbcs  X_4, X_4, X_5
     sbcs  X_6, X_6, X_7
     stp   X_4, X_6, [X_2, #16]      // Store the result in the destination
@@ -1381,14 +1402,14 @@ LABEL(SymCryptFdefMontgomeryReduceRawSubAsmLoop)
     orr   X_11, X_10, X_0           // X_11 = hc|d
 
     // Prepare the pointers for masked copy
-    mov   X_0, X_17                 // pSrc
+    sub   X_0, X_1, X_21            // Restore pSrc pointer (subtract byte count)
     mov   X_1, X_14                 // pDst
-    mov   X_2, X_15                 // Restore the digit counter
+    mov   X_2, X_21                 // Restore the byte counter
     subs  X_4, X_10, X_11           // If (X_11 > X_10) clear the carry flag (i.e. borrow)
 
 LABEL(SymCryptFdefMontgomeryReduceMaskedCopyAsmLoop)
-    sub   X_2, X_2, #1              // decrement the digit count by one
+    sub   X_2, X_2, #32             // decrement the byte count by 32
     ldp   X_4, X_6, [X_0, #32]!     // Load two words of the source
     ldp   X_5, X_7, [X_1, #32]!     // Load two words of the destination
```
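For readability, here is a hedged word-level Python model of what the restructured routine computes (an illustrative sketch, not SymCrypt's reference implementation; the asm additionally peels the first digit of each outer pass and keeps `c` live in `X_20` rather than re-zeroing it):

```python
# Hypothetical word-level model of the restructured Montgomery reduction
# (the asm processes a digit = 4 words per inner pass; this model goes word
# by word). Illustrative only - not SymCrypt's reference implementation.
W = 1 << 64

def montgomery_reduce(src, mod, inv64):
    n = len(mod)                        # word count of the modulus
    hc = 0                              # the "high carry" kept in X_7
    for i in range(n):                  # outer loop: one word of pSrc per pass
        m = (src[i] * inv64) % W
        c = 0                           # the running carry kept in X_20
        for j in range(n):
            t = src[i + j] + m * mod[j] + c
            src[i + j] = t % W          # == 0 for j == 0 (the peeled first digit)
            c = t // W
        t = src[i + n] + c + hc         # fold c + hc into pSrc[nWords]
        src[i + n] = t % W
        hc = t // W
    # Final conditional subtract (the masked copy in the asm)
    full = (hc << (64 * n)) + sum(s << (64 * k) for k, s in enumerate(src[n:]))
    m_val = sum(w << (64 * k) for k, w in enumerate(mod))
    return full - m_val if full >= m_val else full

# Check against plain modular arithmetic: reducing x yields x * 2^-256 mod M
M = 2**255 - 19                         # arbitrary odd 4-word modulus
mod = [(M >> (64 * k)) % W for k in range(4)]
inv64 = (-pow(mod[0], -1, W)) % W
x = 0x1234567890ABCDEF
src = [(x >> (64 * k)) % W for k in range(8)]
assert montgomery_reduce(src, mod, inv64) == (x * pow(2, -256, M)) % M
```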

View file

```diff
@@ -645,26 +645,6 @@ MAPPING_ARM32_AAPCS32 = {
     15:ARM32_R15, # PC
 }
 
-def gen_prologue_aapcs64(self, arg_count, reg_count, stack_alloc_size, xmm_reg_count):
-    prologue = ""
-
-    if reg_count > self.volatile_registers:
-        logging.error("symcryptasm currently does not support spilling registers in leaf functions in aapcs64")
-        exit(1)
-
-    return prologue
-
-def gen_epilogue_aapcs64(self, arg_count, reg_count, stack_alloc_size, xmm_reg_count):
-    epilogue = ""
-
-    if reg_count > self.volatile_registers:
-        logging.error("symcryptasm currently does not support spilling registers in leaf functions in aapcs64")
-        exit(1)
-
-    epilogue += "    ret\n"
-
-    return epilogue
-
 def gen_prologue_aapcs32(self, arg_count, reg_count, stack_alloc_size, xmm_reg_count):
     assert(not stack_alloc_size and not xmm_reg_count)
     prologue = ""
@@ -697,7 +677,7 @@ def gen_epilogue_aapcs32(self, arg_count, reg_count, stack_alloc_size, xmm_reg_c
     epilogue += "pop {" + ",".join(registers_to_spill) + "}\n"
     return epilogue
 
-def gen_prologue_arm64ec(self, arg_count, reg_count, stack_alloc_size, xmm_reg_count):
+def gen_prologue_arm64_armasm64(self, arg_count, reg_count, stack_alloc_size, xmm_reg_count):
     prologue = ""
 
     if reg_count > self.volatile_registers:
@@ -717,7 +697,7 @@ def gen_prologue_arm64ec(self, arg_count, reg_count, stack_alloc_size, xmm_reg_c
     return prologue
 
-def gen_epilogue_arm64ec(self, arg_count, reg_count, stack_alloc_size, xmm_reg_count):
+def gen_epilogue_arm64_armasm64(self, arg_count, reg_count, stack_alloc_size, xmm_reg_count):
     epilogue = ""
 
     if reg_count > self.volatile_registers:
@@ -741,17 +721,63 @@ def gen_epilogue_arm64ec(self, arg_count, reg_count, stack_alloc_size, xmm_reg_c
     return epilogue
 
+def gen_prologue_arm64_gas(self, arg_count, reg_count, stack_alloc_size, xmm_reg_count):
+    prologue = ""
+
+    if reg_count > self.volatile_registers:
+        # Calculate required stack space
+        # If we allocate stack space we must spill fp and lr, so we always spill at least 2 registers
+        registers_to_spill = 2 + reg_count - self.volatile_registers
+        # Stack pointer remain 16B aligned, so round up to the nearest multiple of 16B
+        required_stack_space = 16 * ((registers_to_spill + 1) // 2)
+        prologue += "    stp fp, lr, [sp, #-%d]! // allocate %d bytes of stack; store FP/LR\n" % (required_stack_space, required_stack_space)
+
+        stack_offset = 16
+        for i in range(self.volatile_registers, reg_count-1, 2):
+            prologue += "    stp X_%d, X_%d, [sp, #%d]\n" % (i, i+1, stack_offset)
+            stack_offset += 16
+        if registers_to_spill % 2 == 1:
+            prologue += "    str X_%d, [sp, #%d]\n" % (reg_count-1, stack_offset)
+
+    return prologue
+
+def gen_epilogue_arm64_gas(self, arg_count, reg_count, stack_alloc_size, xmm_reg_count):
+    epilogue = ""
+
+    if reg_count > self.volatile_registers:
+        # Calculate required stack space
+        # If we allocate stack space we must spill fp and lr, so we always spill at least 2 registers
+        registers_to_spill = 2 + reg_count - self.volatile_registers
+        # Stack pointer remain 16B aligned, so round up to the nearest multiple of 16B
+        required_stack_space = 16 * ((registers_to_spill + 1) // 2)
+
+        stack_offset = required_stack_space-16
+        if registers_to_spill % 2 == 1:
+            epilogue += "    ldr X_%d, [sp, #%d]\n" % (reg_count-1, stack_offset)
+            stack_offset -= 16
+        for i in reversed(range(self.volatile_registers, reg_count-1, 2)):
+            epilogue += "    ldp X_%d, X_%d, [sp, #%d]\n" % (i, i+1, stack_offset)
+            stack_offset -= 16
+        epilogue += "    ldp fp, lr, [sp], #%d // deallocate %d bytes of stack; restore FP/LR\n" % (required_stack_space, required_stack_space)
+
+    epilogue += "    ret\n"
+
+    return epilogue
+
 def gen_get_memslot_offset_arm64(self, slot, arg_count, reg_count, stack_alloc_size, xmm_reg_count, nested=False):
     logging.error("symcryptasm currently does not support memory slots for arm64!")
     exit(1)
 
-CALLING_CONVENTION_ARM64_AAPCS64 = CallingConvention(
+CALLING_CONVENTION_ARM64_AAPCS64_ARMASM64 = CallingConvention(
     "arm64_aapcs64", "arm64", MAPPING_ARM64_AAPCS64, 8, 8, 18,
-    gen_prologue_aapcs64, gen_epilogue_aapcs64, gen_get_memslot_offset_arm64)
+    gen_prologue_arm64_armasm64, gen_epilogue_arm64_armasm64, gen_get_memslot_offset_arm64)
+
+CALLING_CONVENTION_ARM64_AAPCS64_GAS = CallingConvention(
+    "arm64_aapcs64", "arm64", MAPPING_ARM64_AAPCS64, 8, 8, 18,
+    gen_prologue_arm64_gas, gen_epilogue_arm64_gas, gen_get_memslot_offset_arm64)
 
 CALLING_CONVENTION_ARM64EC_MSFT = CallingConvention(
     "arm64ec_msft", "arm64", MAPPING_ARM64_ARM64ECMSFT, 8, 8, 16,
-    gen_prologue_arm64ec, gen_epilogue_arm64ec, gen_get_memslot_offset_arm64)
+    gen_prologue_arm64_armasm64, gen_epilogue_arm64_armasm64, gen_get_memslot_offset_arm64)
 
 CALLING_CONVENTION_ARM32_AAPCS32 = CallingConvention(
     "arm32_aapcs32", "arm32", MAPPING_ARM32_AAPCS32, 4, 4, 4,
@@ -1194,7 +1220,7 @@ def process_file(assembler, architecture, calling_convention, infilename, outfil
             mul_calling_convention = CALLING_CONVENTION_AMD64_SYSTEMV_MUL
             nested_calling_convention = CALLING_CONVENTION_AMD64_SYSTEMV_NESTED
         elif architecture == "arm64" and calling_convention == "aapcs64":
-            normal_calling_convention = CALLING_CONVENTION_ARM64_AAPCS64
+            normal_calling_convention = CALLING_CONVENTION_ARM64_AAPCS64_GAS
            mul_calling_convention = None
            nested_calling_convention = None
        elif architecture == "arm" and calling_convention == "aapcs32":
@@ -1203,7 +1229,7 @@ def process_file(assembler, architecture, calling_convention, infilename, outfil
             nested_calling_convention = None
     elif assembler == "armasm64":
         if architecture == "arm64" and calling_convention == "aapcs64":
-            normal_calling_convention = CALLING_CONVENTION_ARM64_AAPCS64
+            normal_calling_convention = CALLING_CONVENTION_ARM64_AAPCS64_ARMASM64
             mul_calling_convention = None
             nested_calling_convention = None
         elif architecture == "arm64" and calling_convention == "arm64ec":
```
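As a sanity check on the new spill logic, here is a minimal standalone re-creation of the gas prologue arithmetic (assumptions: the same constants as the diff above; this does not import or invoke the real script). It shows what gets emitted for this PR's reduce routine, which declares 22 symbolic registers against 18 volatile ones in aapcs64:

```python
# Standalone re-creation of the gen_prologue_arm64_gas spill arithmetic
# (hypothetical sketch - not importing the actual symcryptasm processor).
volatile_registers = 18
reg_count = 22    # FUNCTION_START(..., 3, 22) in the fdef_asm diff above

registers_to_spill = 2 + reg_count - volatile_registers        # fp/lr + 4 = 6
required_stack_space = 16 * ((registers_to_spill + 1) // 2)    # 48 bytes, 16B-aligned

lines = ["stp fp, lr, [sp, #-%d]!" % required_stack_space]
offset = 16
for i in range(volatile_registers, reg_count - 1, 2):
    lines.append("stp X_%d, X_%d, [sp, #%d]" % (i, i + 1, offset))
    offset += 16
if registers_to_spill % 2 == 1:                                # odd count: one str
    lines.append("str X_%d, [sp, #%d]" % (reg_count - 1, offset))

print("\n".join(lines))
# stp fp, lr, [sp, #-48]!
# stp X_18, X_19, [sp, #16]
# stp X_20, X_21, [sp, #32]
```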

View file

```diff
@@ -1 +1 @@
-{ "major": 103, "minor": 4, "patch": 2 }
+{ "major": 103, "minor": 4, "patch": 3 }
```