Merged PR 11150425: Arm64 server perf work

## Description:

+ Improve `SymCryptFdefMontgomeryReduceAsm`
  + Reduce instruction count in the inner loop - remove a superfluous `adc` with zero
  + Special-case the first iteration of the reduction loop to further reduce instruction count and multiplication uops (see the sketch below)
  + To keep the assembly easy to phrase, use non-volatile registers in aapcs64 assembly for the first time; this required slightly extending the symcryptasm processor script
+ Improve `SymCryptFdefRawSquareAsm` by reordering instructions so the loop-carried carry is added last, reducing undue dependencies

+ There is more room for improvement in a follow-on PR, but checking in what we have so the gains land before the GE deadline.
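
A minimal Python sketch of one outer iteration of word-wise Montgomery reduction, for context (illustrative only, not SymCrypt code; `montgomery_reduce_step` and its parameter names are hypothetical, and it assumes `inv64` is the negated inverse of the lowest modulus word mod 2^64):

```python
M64 = 1 << 64  # one 64-bit word

def montgomery_reduce_step(src, mod, inv64, i):
    """One outer iteration: add m*mod so that src[i] becomes divisible by 2^64."""
    m = (src[i] * inv64) % M64   # m = pSrc[i] * Inv64 mod 2^64
    carry = 0
    for j in range(len(mod)):
        t = src[i + j] + m * mod[j] + carry
        src[i + j] = t % M64     # low word; always 0 for j == 0 by choice of m,
        carry = t // M64         # so that multiply-accumulate only needs its carry
    return carry                 # folded into the high carry (hc) by the caller
```

Because the `j == 0` word is zero by construction, the new first iteration can store `xzr` to `pSrc[i+0]` and recover the carry of `pSrc[i] + m*pMod[0]` with a single `subs` against `#1` instead of a full multiply-accumulate.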

## Admin Checklist:
- [X] You have updated documentation in symcrypt.h to reflect any changes in behavior
- [X] You have updated CHANGELOG.md to reflect any changes in behavior
- [X] You have updated symcryptunittest to exercise any new functionality
- [X] If you have introduced any symbols in symcrypt.h you have updated production and test dynamic export symbols (exports.ver / exports.def / symcrypt.src) and tested the updated dynamic modules with symcryptunittest
- [X] If you have introduced functionality that varies based on CPU features, you have manually tested with and without relevant features
- [X] If you have made significant changes to a particular algorithm, you have checked that performance numbers reported by symcryptunittest are in line with expectations
- [X] If you have added new algorithms/modes, you have updated the status indicator text for the associated modules if necessary
Samuel Lee 2024-07-26 02:18:13 +00:00
Parent 5e521761ef
Commit 982858166c
5 changed files: 171 additions and 121 deletions

View file

@@ -3,7 +3,10 @@
New changes will be listed here as they are developed. The version number is determined
prior to the creation of a new release, based on the changes contained in that release.
# Version 103.4.3
- Added preliminary support for macOS (static libraries and unit tests only, no ASM optimizations)
- Performance improvements for RSA on modern Arm64 microarchitectures
# Version 103.4.2

View file

@@ -331,7 +331,7 @@ FUNCTION_START(ARM64EC_NAME_MANGLE(SymCryptFdef369MontgomeryReduceAsm), 3, 18)
mov X_16, X_0 // Store the pMod pointer
mov X_17, X_1 // Store the pSrc pointer
and X_7, X_7, xzr // Set hc to 0
mov X_7, xzr // Set hc to 0
//
// Main loop
@@ -340,7 +340,7 @@ LABEL(SymCryptFdef369MontgomeryReduceAsmOuter)
ldr X_8, [X_1, #24] // Load 1 word from pSrc
mul X_6, X_8, X_5 // <63:0> bits of pSrc[i]*Inv64 = m
and X_12, X_12, xzr // Set c to 0
mov X_12, xzr // Set c to 0
LABEL(SymCryptFdef369MontgomeryReduceAsmInner)
ldp X_10, X_11, [X_0, #24]! // pMod[j]

View file

@@ -308,13 +308,14 @@ MACRO_START(SQR_DOUBLEADD_64, index, src_reg, dst_reg, mul_word, src_carry, dst_
ldr scratch2, [dst_reg, #8*index] // pDst[2*(i+j)]
mul scratch1, mul_word, scratch0 // Bits <63:0> of pSrc[i]*pSrc[i+j]
adds scratch1, scratch1, src_carry // Adding the previous word
umulh dst_carry, mul_word, scratch0 // Bits <127:64> of pSrc[i]*pSrc[i+j]
adc dst_carry, dst_carry, xzr // Add the intermediate carry and don't update the flags
adds scratch1, scratch1, scratch2 // Add the word from the destination
adc dst_carry, dst_carry, xzr // Add the intermediate carry and don't update the flags
adds scratch1, scratch1, src_carry // Adding the previous word
adc dst_carry, dst_carry, xzr // Add the intermediate carry and don't update the flags
str scratch1, [dst_reg, #8*index] // Store to destination
MACRO_END()
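// (Note on the reordering above: src_carry, the value carried in from the previous
// invocation of this macro, is now added last, so the mul/umulh and the addition of
// the destination word no longer wait on it.)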
@@ -384,9 +385,9 @@ FUNCTION_START(ARM64EC_NAME_MANGLE(SymCryptFdefRawSquareAsm), 3, 17)
//
// First iteration of main loop (no adding of previous values from pDst)
//
ands X_12, X_12, xzr // Clearing the carry flag and setting X_12 = 0
mov X_12, xzr // set X_12 = 0
ldr X_6, [X_0] // load the first word from pSrc1
str X_12, [X_4] // store 0 for the first word
str xzr, [X_4] // store 0 for the first word
b SymCryptFdefRawSquareAsmInnerLoopInit_Word1
@@ -423,7 +424,8 @@ LABEL(SymCryptFdefRawSquareAsmOuterLoop)
mov X_2, X_0 // set pSrc
mov X_4, X_5 // set pDst
ands X_12, X_12, xzr // Clearing the carry flag and setting X_12 = 0
mov X_11, xzr // set X_11 = 0
mov X_12, xzr // set X_12 = 0
ldr X_6, [X_0, X_8, LSL #3] // load the next word from pSrc
// Cyclic counter and jump logic
@@ -448,16 +450,16 @@ LABEL(SymCryptFdefRawSquareAsmOuterLoop)
mov X_3, X_14 // set the new digit counter
LABEL(SymCryptFdefRawSquareAsmInnerLoop_Word0)
SQR_DOUBLEADD_64 0, X_2, X_4, X_6, X_12, X_12, X_7, X_9, X_10
SQR_DOUBLEADD_64 0, X_2, X_4, X_6, X_12, X_11, X_7, X_9, X_10
LABEL(SymCryptFdefRawSquareAsmInnerLoop_Word1)
SQR_DOUBLEADD_64 1, X_2, X_4, X_6, X_12, X_12, X_7, X_9, X_10
SQR_DOUBLEADD_64 1, X_2, X_4, X_6, X_11, X_12, X_7, X_9, X_10
LABEL(SymCryptFdefRawSquareAsmInnerLoop_Word2)
SQR_DOUBLEADD_64 2, X_2, X_4, X_6, X_12, X_12, X_7, X_9, X_10
SQR_DOUBLEADD_64 2, X_2, X_4, X_6, X_12, X_11, X_7, X_9, X_10
LABEL(SymCryptFdefRawSquareAsmInnerLoop_Word3)
SQR_DOUBLEADD_64 3, X_2, X_4, X_6, X_12, X_12, X_7, X_9, X_10
SQR_DOUBLEADD_64 3, X_2, X_4, X_6, X_11, X_12, X_7, X_9, X_10
sub X_3, X_3, #1 // move one digit up
add X_2, X_2, #32
@@ -470,8 +472,7 @@ LABEL(SymCryptFdefRawSquareAsmInnerLoop_Word3)
sub X_1, X_1, #1 // move one word up
cbnz X_1, SymCryptFdefRawSquareAsmOuterLoop
ands X_12, X_12, xzr // Setting X_12 = 0
str X_12, [X_5, #40] // Store 0 to destination for the top word
str xzr, [X_5, #40] // Store 0 to destination for the top word
////////////////////////////////////////////////////////////////
// Second Pass - Shifting all results 1 bit left
@@ -1240,136 +1241,156 @@ FUNCTION_END(ARM64EC_NAME_MANGLE(SymCryptFdefModSquareMontgomeryP384Asm))
// X_0 = pMod (moving forward one *digit* every inner loop)
// X_1 = pSrc (moving forward one *digit* every inner loop)
// X_2 = pDst (used only in the end for subtract / result)
// X_3 = digit count of pSrc and pMod
// X_3 = byte count of pSrc and pMod (sometimes the byte count less one digit's worth of bytes)
// X_4 = word count of pSrc
// X_5 = Inv64 of the modulus
// X_6 = m = pSrc[i]*Inv64
// X_7 = hc = high carry variable
// X_8, X_9 = Current words loaded in pairs from pSrc
// X_10, X_11 = Current words loaded in pairs from pMod
// X_12, X_13 = c variable = "128-bit" register to hold the result of multiplies
// It is flipped between [X_12:X_13] and [X_13:X_12] instead of doing c>>=64
// X_14 = Temporary intermediate result
// X_15 = Stored digit count of pSrc
// X_16 = Stored pMod pointer
// X_17 = Stored pSrc pointer (moving forward one word every outer loop)
// X_8-X_19 = temporaries
// X_20 = c
// X_21 = Stored byte count of pSrc
FUNCTION_START(ARM64EC_NAME_MANGLE(SymCryptFdefMontgomeryReduceAsm), 3, 18)
FUNCTION_START(ARM64EC_NAME_MANGLE(SymCryptFdefMontgomeryReduceAsm), 3, 22)
ldr W_3, [X_0, #SymCryptModulusNdigitsOffsetArm64] // # of Digits
ldr W_4, [X_0, #SymCryptModulusNdigitsOffsetArm64] // # of Digits
ldr X_5, [X_0, #SymCryptModulusInv64OffsetArm64] // Inv64 of modulus
add X_0, X_0, #SymCryptModulusValueOffsetArm64 // pMod
lsl X_4, X_3, #2 // Multiply by 4 to get the number of words
lsl X_3, X_4, #5 // Multiply by 32 to get the number of bytes
lsl X_4, X_4, #2 // Multiply by 4 to get the number of words
sub X_0, X_0, #32 // offset pMod so we can use pre-increment form of loads
sub X_1, X_1, #32 // offset pSrc so we can use pre-increment form of loads
sub X_2, X_2, #32 // offset pDst so we can use pre-increment form of loads
sub X_3, X_3, #32 // offset the byte count to make it easy to restore pointers
mov X_15, X_3 // Store the digit count for later
mov X_16, X_0 // Store the pMod pointer
mov X_17, X_1 // Store the pSrc pointer
and X_7, X_7, xzr // Set hc to 0
mov X_7, xzr // Set hc to 0
mov X_21, X_3 // Store the byte count for later
//
// Main loop
//
LABEL(SymCryptFdefMontgomeryReduceAsmOuter)
ldr X_8, [X_1, #32] // Load 1 word from pSrc
mul X_6, X_8, X_5 // <63:0> bits of pSrc[i]*Inv64 = m
ldp X_16, X_17, [X_1] // pSrc[i+0], pSrc[i+1]
mul X_6, X_16, X_5 // <63:0> bits of pSrc[i]*Inv64 = m
and X_12, X_12, xzr // Set c to 0
ldp X_8, X_9, [X_0] // pMod[0], pMod[1]
umulh X_8, X_6, X_8 // Bits <127:64> of m*pMod[0]
subs xzr, X_16, #1 // Subtract 1 from pSrc[i+0] (generate a carry iff pSrc[i] is non-zero)
mul X_13, X_6, X_9 // Bits <63:0> of m*pMod[1]
umulh X_9, X_6, X_9 // Bits <127:64> of m*pMod[1]
adcs X_8, X_8, X_13 // Adding the previous word (if there was a carry from the last addition it is added)
ldp X_10, X_11, [X_0, #16] // pMod[2], pMod[3]
mul X_14, X_6, X_10 // Bits <63:0> of m*pMod[2]
umulh X_10, X_6, X_10 // Bits <127:64> of m*pMod[2]
adcs X_9, X_9, X_14 // Adding the previous word (if there was a carry from the last addition it is added)
ldp X_18, X_19, [X_1, #16] // pSrc[i+2], pSrc[i+3]
mul X_15, X_6, X_11 // Bits <63:0> of m*pMod[3]
umulh X_11, X_6, X_11 // Bits <127:64> of m*pMod[3]
adcs X_10, X_10, X_15 // Adding the previous word (if there was a carry from the last addition it is added)
adc X_20, X_11, xzr // Add the carry if any
// ( X_20, X_10, X_9, X_8, ___ ) = (( pMod[3], pMod[2], pMod[1], pMod[0] ) * m)
// We do not compute this --^ because (m*pMod[0] + pSrc[i]) == 0. We already accounted for the carry above.
adds X_17, X_17, X_8 // Adding pSrc[i+1]
adcs X_18, X_18, X_9 // Adding pSrc[i+2]
adcs X_19, X_19, X_10 // Adding pSrc[i+3]
stp xzr, X_17, [X_1] // pSrc[i+0], pSrc[i+1]
stp X_18, X_19, [X_1, #16] // pSrc[i+2], pSrc[i+3]
cbz X_3, SymCryptFdefMontgomeryReduceAsmInnerEnd
LABEL(SymCryptFdefMontgomeryReduceAsmInner)
ldp X_10, X_11, [X_0, #32]! // pMod[j]
ldp X_8, X_9, [X_1, #32]! // pSrc[j]
// Invariant: X_20 (c) + carry flag is a value in the range [0, 2^64-1]
// We could always adc X_20, X_20, xzr before loop entry, but that would be a wasted
// instruction vs. folding the carry into the adcs with X_20 below
mul X_14, X_6, X_10 // <63:0> of pMod[j]*m
adds X_14, X_14, X_8 // Adding pSrc[j]
umulh X_13, X_6, X_10 // <127:64> of pMod[j]*m
adc X_13, X_13, xzr // Add the carry if any (***)
adds X_12, X_12, X_14 // Add the lower bits of c
adc X_13, X_13, xzr // Add the carry if any (***)
// ***: These cannot produce extra carry as the maximum is
// (2^64 - 1)*(2^64-1) + 2^64-1 + 2^64-1 = 2^128 - 1
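// (factoring: (2^64 - 1)*(2^64 - 1) + 2*(2^64 - 1) = (2^64 - 1)*(2^64 + 1) = 2^128 - 1)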
str X_12, [X_1] // pSrc[j] = (UINT64) c
ldp X_8, X_9, [X_0, #32]! // pMod[j+0], pMod[j+1]
mul X_12, X_6, X_8 // Bits <63:0> of m*pMod[j]
umulh X_8, X_6, X_8 // Bits <127:64> of m*pMod[j]
adcs X_12, X_20, X_12 // Adding the previous word (if there was a carry from the last addition it is added)
mul X_14, X_6, X_11 // <63:0> of pMod[j]*m
adds X_14, X_14, X_9 // Adding pSrc[j]
umulh X_12, X_6, X_11 // <127:64> of pMod[j]*m
adc X_12, X_12, xzr // Add the carry if any (***)
adds X_13, X_13, X_14 // Add the lower bits of c
adc X_12, X_12, xzr // Add the carry if any (***)
str X_13, [X_1, #8] // pSrc[j] = (UINT64) c
mul X_13, X_6, X_9 // Bits <63:0> of m*pMod[j+1]
umulh X_9, X_6, X_9 // Bits <127:64> of m*pMod[j+1]
ldp X_16, X_17, [X_1, #32]! // pSrc[i+j+0], pSrc[i+j+1]
adcs X_8, X_8, X_13 // Adding the previous word (if there was a carry from the last addition it is added)
ldp X_10, X_11, [X_0, #16] // pMod[j]
ldp X_8, X_9, [X_1, #16] // pSrc[j]
ldp X_10, X_11, [X_0, #16] // pMod[j+2], pMod[j+3]
mul X_14, X_6, X_10 // Bits <63:0> of m*pMod[j+2]
umulh X_10, X_6, X_10 // Bits <127:64> of m*pMod[j+2]
adcs X_9, X_9, X_14 // Adding the previous word (if there was a carry from the last addition it is added)
mul X_14, X_6, X_10 // <63:0> of pMod[j]*m
adds X_14, X_14, X_8 // Adding pSrc[j]
umulh X_13, X_6, X_10 // <127:64> of pMod[j]*m
adc X_13, X_13, xzr // Add the carry if any (***)
adds X_12, X_12, X_14 // Add the lower bits of c
adc X_13, X_13, xzr // Add the carry if any (***)
str X_12, [X_1, #16] // pSrc[j] = (UINT64) c
ldp X_18, X_19, [X_1, #16] // pSrc[i+j+2], pSrc[i+j+3]
mul X_15, X_6, X_11 // Bits <63:0> of m*pMod[j+3]
umulh X_11, X_6, X_11 // Bits <127:64> of m*pMod[j+3]
adcs X_10, X_10, X_15 // Adding the previous word (if there was a carry from the last addition it is added)
mul X_14, X_6, X_11 // <63:0> of pMod[j]*m
adds X_14, X_14, X_9 // Adding pSrc[j]
umulh X_12, X_6, X_11 // <127:64> of pMod[j]*m
adc X_12, X_12, xzr // Add the carry if any (***)
adds X_13, X_13, X_14 // Add the lower bits of c
adc X_12, X_12, xzr // Add the carry if any (***)
str X_13, [X_1, #24] // pSrc[j] = (UINT64) c
adc X_20, X_11, xzr // Add the carry if any
subs X_3, X_3, #1 // Move one digit up
bne SymCryptFdefMontgomeryReduceAsmInner
// ( X_20, X_10, X_9, X_8, X_12 ) = (( pMod[j+3], pMod[j+2], pMod[j+1], pMod[j+0] ) * m) + c
adds X_16, X_16, X_12 // Adding pSrc[i+j+0]
adcs X_17, X_17, X_8 // Adding pSrc[i+j+1]
adcs X_18, X_18, X_9 // Adding pSrc[i+j+2]
adcs X_19, X_19, X_10 // Adding pSrc[i+j+3]
stp X_16, X_17, [X_1] // pSrc[i+j+0], pSrc[i+j+1]
stp X_18, X_19, [X_1, #16] // pSrc[i+j+2], pSrc[i+j+3]
sub X_3, X_3, #32 // Move one digit up
cbnz X_3, SymCryptFdefMontgomeryReduceAsmInner
LABEL(SymCryptFdefMontgomeryReduceAsmInnerEnd)
ldr X_8, [X_1, #32] // pSrc[nWords]
adds X_12, X_12, X_8 // c + pSrc[nWords]
adc X_13, xzr, xzr // Add the carry if any
adcs X_20, X_20, X_7 // c + hc
adc X_7, xzr, xzr // Add the carry if any
adds X_12, X_12, X_7 // c + pSrc[nWords] + hc
adc X_7, X_13, xzr // Add the carry if any and store into hc
adds X_20, X_20, X_8 // c + hc + pSrc[nWords]
adc X_7, X_7, xzr // Add the carry if any and store into hc
str X_12, [X_1, #32] // pSrc[nWords] = c
str X_20, [X_1, #32] // pSrc[nWords] = c
subs X_4, X_4, #1 // Move one word up
add X_17, X_17, #8 // Move stored pSrc pointer one word up
mov X_0, X_16 // Restore pMod pointer
mov X_1, X_17 // Restore pSrc pointer
sub X_0, X_0, X_21 // Restore pMod pointer (subtract byte count)
sub X_1, X_1, X_21 // Restore pSrc pointer (subtract byte count)
add X_1, X_1, #8 // Move pSrc pointer one word up
mov X_3, X_15 // Restore the digit counter
mov X_3, X_21 // Restore the byte counter
bne SymCryptFdefMontgomeryReduceAsmOuter
//
// Subtraction
//
add X_3, X_3, #32 // restore the full byte count for loops using unconditional pre-indexing
mov X_21, X_3 // Store the byte count for later
sub X_0, X_0, #32 // offset pMod so we can use pre-increment form of loads
sub X_1, X_1, #32 // offset pSrc so we can use pre-increment form of loads
sub X_2, X_2, #32 // offset pDst so we can use pre-increment form of loads
mov X_14, X_2 // Store pDst pointer
// Prepare the pointers for subtract
mov X_0, X_17 // pSrc
mov X_1, X_16 // pMod
mov X_10, X_7 // X_10 = hc
mov X_3, X_15 // Restore the digit counter
subs X_4, X_4, X_4 // Set the carry flag (i.e. no borrow)
LABEL(SymCryptFdefMontgomeryReduceRawSubAsmLoop)
sub X_3, X_3, #1 // Decrement the digit count by one
sub X_3, X_3, #32 // Decrement the byte count by 32
// borrow is in the carry flag (flipped)
ldp X_4, X_6, [X_0, #32]! // Load two words of pSrc1
ldp X_5, X_7, [X_1, #32]! // Load two words of pSrc2
ldp X_4, X_6, [X_1, #32]! // Load two words of pSrc
ldp X_5, X_7, [X_0, #32]! // Load two words of pMod
sbcs X_4, X_4, X_5
sbcs X_6, X_6, X_7
stp X_4, X_6, [X_2, #32]! // Store the result in the destination
ldp X_4, X_6, [X_0, #16] // Load two words of pSrc1
ldp X_5, X_7, [X_1, #16] // Load two words of pSrc2
ldp X_4, X_6, [X_1, #16] // Load two words of pSrc
ldp X_5, X_7, [X_0, #16] // Load two words of pMod
sbcs X_4, X_4, X_5
sbcs X_6, X_6, X_7
stp X_4, X_6, [X_2, #16] // Store the result in the destination
@@ -1381,14 +1402,14 @@ LABEL(SymCryptFdefMontgomeryReduceRawSubAsmLoop)
orr X_11, X_10, X_0 // X_11 = hc|d
// Prepare the pointers for masked copy
mov X_0, X_17 // pSrc
sub X_0, X_1, X_21 // Restore pSrc pointer (subtract byte count)
mov X_1, X_14 // pDst
mov X_2, X_15 // Restore the digit counter
mov X_2, X_21 // Restore the byte counter
subs X_4, X_10, X_11 // If (X_11 > X_10) clear the carry flag (i.e. borrow)
LABEL(SymCryptFdefMontgomeryReduceMaskedCopyAsmLoop)
sub X_2, X_2, #1 // decrement the digit count by one
sub X_2, X_2, #32 // decrement the byte count by 32
ldp X_4, X_6, [X_0, #32]! // Load two words of the source
ldp X_5, X_7, [X_1, #32]! // Load two words of the destination

View file

@@ -645,26 +645,6 @@ MAPPING_ARM32_AAPCS32 = {
    15:ARM32_R15, # PC
}
def gen_prologue_aapcs64(self, arg_count, reg_count, stack_alloc_size, xmm_reg_count):
    prologue = ""
    if reg_count > self.volatile_registers:
        logging.error("symcryptasm currently does not support spilling registers in leaf functions in aapcs64")
        exit(1)
    return prologue

def gen_epilogue_aapcs64(self, arg_count, reg_count, stack_alloc_size, xmm_reg_count):
    epilogue = ""
    if reg_count > self.volatile_registers:
        logging.error("symcryptasm currently does not support spilling registers in leaf functions in aapcs64")
        exit(1)
    epilogue += " ret\n"
    return epilogue

def gen_prologue_aapcs32(self, arg_count, reg_count, stack_alloc_size, xmm_reg_count):
    assert(not stack_alloc_size and not xmm_reg_count)
    prologue = ""
@@ -697,7 +677,7 @@ def gen_epilogue_aapcs32(self, arg_count, reg_count, stack_alloc_size, xmm_reg_c
epilogue += "pop {" + ",".join(registers_to_spill) + "}\n"
return epilogue
def gen_prologue_arm64ec(self, arg_count, reg_count, stack_alloc_size, xmm_reg_count):
def gen_prologue_arm64_armasm64(self, arg_count, reg_count, stack_alloc_size, xmm_reg_count):
    prologue = ""
    if reg_count > self.volatile_registers:
@@ -717,7 +697,7 @@ def gen_prologue_arm64ec(self, arg_count, reg_count, stack_alloc_size, xmm_reg_c
    return prologue

def gen_epilogue_arm64ec(self, arg_count, reg_count, stack_alloc_size, xmm_reg_count):
def gen_epilogue_arm64_armasm64(self, arg_count, reg_count, stack_alloc_size, xmm_reg_count):
    epilogue = ""
    if reg_count > self.volatile_registers:
@@ -741,17 +721,63 @@ def gen_epilogue_arm64ec(self, arg_count, reg_count, stack_alloc_size, xmm_reg_c
    return epilogue

def gen_prologue_arm64_gas(self, arg_count, reg_count, stack_alloc_size, xmm_reg_count):
    prologue = ""
    if reg_count > self.volatile_registers:
        # Calculate required stack space
        # If we allocate stack space we must spill fp and lr, so we always spill at least 2 registers
        registers_to_spill = 2 + reg_count - self.volatile_registers
        # Stack pointer must remain 16B aligned, so round up to the nearest multiple of 16B
        required_stack_space = 16 * ((registers_to_spill + 1) // 2)
        prologue += " stp fp, lr, [sp, #-%d]! // allocate %d bytes of stack; store FP/LR\n" % (required_stack_space, required_stack_space)
        stack_offset = 16
        for i in range(self.volatile_registers, reg_count-1, 2):
            prologue += " stp X_%d, X_%d, [sp, #%d]\n" % (i, i+1, stack_offset)
            stack_offset += 16
        if registers_to_spill % 2 == 1:
            prologue += " str X_%d, [sp, #%d]\n" % (reg_count-1, stack_offset)
    return prologue
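# Worked example (assuming volatile_registers == 18 for aapcs64, per the
# CallingConvention definitions below): SymCryptFdefMontgomeryReduceAsm requests
# reg_count = 22, so registers_to_spill = 2 + 22 - 18 = 6 and
# required_stack_space = 16 * ((6 + 1) // 2) = 48 bytes; fp/lr, X_18/X_19 and
# X_20/X_21 are spilled as three 16-byte pairs.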
def gen_epilogue_arm64_gas(self, arg_count, reg_count, stack_alloc_size, xmm_reg_count):
    epilogue = ""
    if reg_count > self.volatile_registers:
        # Calculate required stack space
        # If we allocate stack space we must spill fp and lr, so we always spill at least 2 registers
        registers_to_spill = 2 + reg_count - self.volatile_registers
        # Stack pointer must remain 16B aligned, so round up to the nearest multiple of 16B
        required_stack_space = 16 * ((registers_to_spill + 1) // 2)
        stack_offset = required_stack_space - 16
        if registers_to_spill % 2 == 1:
            epilogue += " ldr X_%d, [sp, #%d]\n" % (reg_count-1, stack_offset)
            stack_offset -= 16
        for i in reversed(range(self.volatile_registers, reg_count-1, 2)):
            epilogue += " ldp X_%d, X_%d, [sp, #%d]\n" % (i, i+1, stack_offset)
            stack_offset -= 16
        epilogue += " ldp fp, lr, [sp], #%d // deallocate %d bytes of stack; restore FP/LR\n" % (required_stack_space, required_stack_space)
    epilogue += " ret\n"
    return epilogue
def gen_get_memslot_offset_arm64(self, slot, arg_count, reg_count, stack_alloc_size, xmm_reg_count, nested=False):
    logging.error("symcryptasm currently does not support memory slots for arm64!")
    exit(1)
CALLING_CONVENTION_ARM64_AAPCS64 = CallingConvention(
CALLING_CONVENTION_ARM64_AAPCS64_ARMASM64 = CallingConvention(
    "arm64_aapcs64", "arm64", MAPPING_ARM64_AAPCS64, 8, 8, 18,
    gen_prologue_aapcs64, gen_epilogue_aapcs64, gen_get_memslot_offset_arm64)
    gen_prologue_arm64_armasm64, gen_epilogue_arm64_armasm64, gen_get_memslot_offset_arm64)

CALLING_CONVENTION_ARM64_AAPCS64_GAS = CallingConvention(
    "arm64_aapcs64", "arm64", MAPPING_ARM64_AAPCS64, 8, 8, 18,
    gen_prologue_arm64_gas, gen_epilogue_arm64_gas, gen_get_memslot_offset_arm64)

CALLING_CONVENTION_ARM64EC_MSFT = CallingConvention(
    "arm64ec_msft", "arm64", MAPPING_ARM64_ARM64ECMSFT, 8, 8, 16,
    gen_prologue_arm64ec, gen_epilogue_arm64ec, gen_get_memslot_offset_arm64)
    gen_prologue_arm64_armasm64, gen_epilogue_arm64_armasm64, gen_get_memslot_offset_arm64)

CALLING_CONVENTION_ARM32_AAPCS32 = CallingConvention(
    "arm32_aapcs32", "arm32", MAPPING_ARM32_AAPCS32, 4, 4, 4,
@@ -1194,7 +1220,7 @@ def process_file(assembler, architecture, calling_convention, infilename, outfil
            mul_calling_convention = CALLING_CONVENTION_AMD64_SYSTEMV_MUL
            nested_calling_convention = CALLING_CONVENTION_AMD64_SYSTEMV_NESTED
        elif architecture == "arm64" and calling_convention == "aapcs64":
            normal_calling_convention = CALLING_CONVENTION_ARM64_AAPCS64
            normal_calling_convention = CALLING_CONVENTION_ARM64_AAPCS64_GAS
            mul_calling_convention = None
            nested_calling_convention = None
        elif architecture == "arm" and calling_convention == "aapcs32":
@@ -1203,7 +1229,7 @@ def process_file(assembler, architecture, calling_convention, infilename, outfil
            nested_calling_convention = None
    elif assembler == "armasm64":
        if architecture == "arm64" and calling_convention == "aapcs64":
            normal_calling_convention = CALLING_CONVENTION_ARM64_AAPCS64
            normal_calling_convention = CALLING_CONVENTION_ARM64_AAPCS64_ARMASM64
            mul_calling_convention = None
            nested_calling_convention = None
        elif architecture == "arm64" and calling_convention == "arm64ec":

View file

@@ -1 +1 @@
{ "major": 103, "minor": 4, "patch": 2 }
{ "major": 103, "minor": 4, "patch": 3 }