Mirror of https://github.com/microsoft/SymCrypt.git
Merged PR 11150425: Arm64 server perf work
## Description:
+ Improve `SymCryptFdefMontgomeryReduceAsm`
  + Reduce instruction count in the inner loop - remove superfluous `adc` with zero
  + Special case first iteration of the reduction loop to further reduce instruction count and multiplication uops
  + For ease of phrasing, used non-volatile registers in aapcs64 assembly for the first time, and had to slightly extend the SymCryptAsm processor script for this.
+ Improve `SymCryptFdefRawSquareAsm` by tweaking to reduce undue dependencies.
+ More room for improvements in a follow-on PR, but checking in what we have to get improvements before the GE deadline.

## Admin Checklist:
- [X] You have updated documentation in symcrypt.h to reflect any changes in behavior
- [X] You have updated CHANGELOG.md to reflect any changes in behavior
- [X] You have updated symcryptunittest to exercise any new functionality
- [X] If you have introduced any symbols in symcrypt.h you have updated production and test dynamic export symbols (exports.ver / exports.def / symcrypt.src) and tested the updated dynamic modules with symcryptunittest
- [X] If you have introduced functionality that varies based on CPU features, you have manually tested with and without relevant features
- [X] If you have made significant changes to a particular algorithm, you have checked that performance numbers reported by symcryptunittest are in line with expectations
- [X] If you have added new algorithms/modes, you have updated the status indicator text for the associated modules if necessary
This commit is contained in:
Parent: 5e521761ef
Commit: 982858166c
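
For context on the main change: `SymCryptFdefMontgomeryReduceAsm` performs word-by-word Montgomery reduction (REDC), retiring one 64-bit word of `pSrc` per outer-loop pass by adding a multiple of the modulus chosen so that the retired word becomes zero. The sketch below is a minimal Python model of that algorithm, written for illustration only; the function name, the little-endian word-list representation, and the self-test are assumptions of this example, not SymCrypt code.

```python
# Illustrative model of word-wise Montgomery reduction (REDC); not SymCrypt code.
import random

M64 = (1 << 64) - 1

def montgomery_reduce(pSrc, pMod, inv64):
    """pSrc: 2*nWords little-endian 64-bit words, pMod: nWords words (odd modulus),
    inv64 = -pMod^-1 mod 2^64 (the stored Inv64). Returns pSrc * 2^(-64*nWords) mod pMod."""
    nWords = len(pMod)
    src = list(pSrc)
    hc = 0                                   # high carry, the 'hc' variable in the assembly
    for i in range(nWords):
        m = (src[i] * inv64) & M64           # multiplier that zeroes src[i]
        c = 0                                # running carry, the 'c' variable
        for j in range(nWords):
            t = src[i + j] + m * pMod[j] + c
            src[i + j] = t & M64             # src[i] itself becomes 0 here
            c = t >> 64
        t = src[i + nWords] + c + hc         # fold c and hc into the next word
        src[i + nWords] = t & M64
        hc = t >> 64
    value = sum(w << (64 * k) for k, w in enumerate(src[nWords:])) + (hc << (64 * nWords))
    modulus = sum(w << (64 * k) for k, w in enumerate(pMod))
    return value - modulus if value >= modulus else value   # the final subtract / masked copy

# Self-check against direct modular arithmetic (4 words = one 256-bit digit).
nWords = 4
R = 1 << (64 * nWords)
N = random.getrandbits(64 * nWords) | 1 | (1 << (64 * nWords - 1))   # random odd modulus
inv64 = (-pow(N, -1, 1 << 64)) % (1 << 64)
T = random.randrange(N) * random.randrange(N)
src = [(T >> (64 * k)) & M64 for k in range(2 * nWords)]
mod = [(N >> (64 * k)) & M64 for k in range(nWords)]
assert montgomery_reduce(src, mod, inv64) == (T * pow(R, -1, N)) % N
```
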

@@ -3,7 +3,10 @@
 New changes will be listed here as they are developed. The version number is determined
 prior to the creation of a new release, based on the changes contained in that release.
 
+# Version 103.4.3
+
 - Added preliminary support for macOS (static libraries and unit tests only, no ASM optimizations)
+- Performance improvements for RSA for modern Arm64 microarchitecture
 
 # Version 103.4.2
 
@@ -331,7 +331,7 @@ FUNCTION_START(ARM64EC_NAME_MANGLE(SymCryptFdef369MontgomeryReduceAsm), 3, 18)
     mov X_16, X_0 // Store the pMod pointer
     mov X_17, X_1 // Store the pSrc pointer
 
-    and X_7, X_7, xzr // Set hc to 0
+    mov X_7, xzr // Set hc to 0
 
     //
     // Main loop
@@ -340,7 +340,7 @@ LABEL(SymCryptFdef369MontgomeryReduceAsmOuter)
     ldr X_8, [X_1, #24] // Load 1 word from pSrc
     mul X_6, X_8, X_5 // <63:0> bits of pSrc[i]*Inv64 = m
 
-    and X_12, X_12, xzr // Set c to 0
+    mov X_12, xzr // Set c to 0
 
 LABEL(SymCryptFdef369MontgomeryReduceAsmInner)
     ldp X_10, X_11, [X_0, #24]! // pMod[j]
@@ -308,13 +308,14 @@ MACRO_START(SQR_DOUBLEADD_64, index, src_reg, dst_reg, mul_word, src_carry, dst_
     ldr scratch2, [dst_reg, #8*index] // pDst[2*(i+j)]
 
     mul scratch1, mul_word, scratch0 // Bits <63:0> of pSrc[i]*pSrc[i+j]
-    adds scratch1, scratch1, src_carry // Adding the previous word
     umulh dst_carry, mul_word, scratch0 // Bits <127:64> of pSrc[i]*pSrc[i+j]
-    adc dst_carry, dst_carry, xzr // Add the intermediate carry and don't update the flags
 
     adds scratch1, scratch1, scratch2 // Add the word from the destination
     adc dst_carry, dst_carry, xzr // Add the intermediate carry and don't update the flags
 
+    adds scratch1, scratch1, src_carry // Adding the previous word
+    adc dst_carry, dst_carry, xzr // Add the intermediate carry and don't update the flags
+
     str scratch1, [dst_reg, #8*index] // Store to destination
 
 MACRO_END()
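
At the value level, `SQR_DOUBLEADD_64` folds one 64x64-to-128 multiply into a destination word together with an incoming carry. The reordering above only changes when the incoming carry is added, not what is computed, because the whole sum still fits in 128 bits. A hedged Python model of the arithmetic (the function name and tuple return are illustrative, not part of SymCrypt):

```python
M64 = (1 << 64) - 1

def sqr_doubleadd_64(src_word, dst_word, mul_word, src_carry):
    """Value-level model of SQR_DOUBLEADD_64.

    Computes mul_word*src_word + dst_word + src_carry; this fits in 128 bits since
    (2^64-1)^2 + 2*(2^64-1) = 2^128 - 1, so one 64-bit carry word is enough.
    Returns (word stored back to pDst, carry passed to the next invocation).
    """
    t = mul_word * src_word + dst_word + src_carry
    return t & M64, t >> 64
```

Deferring the `src_carry` addition to the end lets the multiply and destination additions start without waiting for the previous invocation's carry; the unrolled Word0-Word3 loop further below alternates the carry between X_12 and X_11 for the same reason, which is why the outer loop now zeroes both.
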
@@ -384,9 +385,9 @@ FUNCTION_START(ARM64EC_NAME_MANGLE(SymCryptFdefRawSquareAsm), 3, 17)
     //
     // First iteration of main loop (no adding of previous values from pDst)
     //
-    ands X_12, X_12, xzr // Clearing the carry flag and setting X_12 = 0
+    mov X_12, xzr // set X_12 = 0
     ldr X_6, [X_0] // load the first word from pSrc1
-    str X_12, [X_4] // store 0 for the first word
+    str xzr, [X_4] // store 0 for the first word
 
     b SymCryptFdefRawSquareAsmInnerLoopInit_Word1
 
@@ -423,7 +424,8 @@ LABEL(SymCryptFdefRawSquareAsmOuterLoop)
     mov X_2, X_0 // set pSrc
     mov X_4, X_5 // set pDst
 
-    ands X_12, X_12, xzr // Clearing the carry flag and setting X_12 = 0
+    mov X_11, xzr // set X_11 = 0
+    mov X_12, xzr // set X_12 = 0
     ldr X_6, [X_0, X_8, LSL #3] // load the next word from pSrc
 
     // Cyclic counter and jump logic
@@ -448,16 +450,16 @@ LABEL(SymCryptFdefRawSquareAsmOuterLoop)
     mov X_3, X_14 // set the new digit counter
 
 LABEL(SymCryptFdefRawSquareAsmInnerLoop_Word0)
-    SQR_DOUBLEADD_64 0, X_2, X_4, X_6, X_12, X_12, X_7, X_9, X_10
+    SQR_DOUBLEADD_64 0, X_2, X_4, X_6, X_12, X_11, X_7, X_9, X_10
 
 LABEL(SymCryptFdefRawSquareAsmInnerLoop_Word1)
-    SQR_DOUBLEADD_64 1, X_2, X_4, X_6, X_12, X_12, X_7, X_9, X_10
+    SQR_DOUBLEADD_64 1, X_2, X_4, X_6, X_11, X_12, X_7, X_9, X_10
 
 LABEL(SymCryptFdefRawSquareAsmInnerLoop_Word2)
-    SQR_DOUBLEADD_64 2, X_2, X_4, X_6, X_12, X_12, X_7, X_9, X_10
+    SQR_DOUBLEADD_64 2, X_2, X_4, X_6, X_12, X_11, X_7, X_9, X_10
 
 LABEL(SymCryptFdefRawSquareAsmInnerLoop_Word3)
-    SQR_DOUBLEADD_64 3, X_2, X_4, X_6, X_12, X_12, X_7, X_9, X_10
+    SQR_DOUBLEADD_64 3, X_2, X_4, X_6, X_11, X_12, X_7, X_9, X_10
 
     sub X_3, X_3, #1 // move one digit up
     add X_2, X_2, #32
@@ -470,8 +472,7 @@ LABEL(SymCryptFdefRawSquareAsmInnerLoop_Word3)
     sub X_1, X_1, #1 // move one word up
     cbnz X_1, SymCryptFdefRawSquareAsmOuterLoop
 
-    ands X_12, X_12, xzr // Setting X_12 = 0
-    str X_12, [X_5, #40] // Store 0 to destination for the top word
+    str xzr, [X_5, #40] // Store 0 to destination for the top word
 
     ////////////////////////////////////////////////////////////////
     // Second Pass - Shifting all results 1 bit left
@@ -1240,136 +1241,156 @@ FUNCTION_END(ARM64EC_NAME_MANGLE(SymCryptFdefModSquareMontgomeryP384Asm))
 // X_0 = pMod (moving forward one *digit* every inner loop)
 // X_1 = pSrc (moving forward one *digit* every inner loop)
 // X_2 = pDst (used only in the end for subtract / result)
-// X_3 = digit count of pSrc and pMod
+// X_3 = byte count of pSrc and pMod (sometimes byte count - 1 digit of bytes)
 // X_4 = word count of pSrc
 // X_5 = Inv64 of the modulus
 // X_6 = m = pSrc[i]*Inv64
 // X_7 = hc = high carry variable
-// X_8, X_9 = Current words loaded in pairs from pSrc
-// X_10, X_11 = Current words loaded in pairs from pMod
-// X_12, X_13 = c variable = "128-bit" register to hold the result of multiplies
-// It is flipped between [X_12:X_13] and [X_13:X_12] instead of doing c>>=64
-// X_14 = Temporary intermediate result
-// X_15 = Stored digit count of pSrc
-// X_16 = Stored pMod pointer
-// X_17 = Stored pSrc pointer (moving forward one word every outer loop)
+// X_8-X_19 = temporaries
+// X_20 = c
+// X_21 = Stored byte count of pSrc
 
-FUNCTION_START(ARM64EC_NAME_MANGLE(SymCryptFdefMontgomeryReduceAsm), 3, 18)
+FUNCTION_START(ARM64EC_NAME_MANGLE(SymCryptFdefMontgomeryReduceAsm), 3, 22)
 
-    ldr W_3, [X_0, #SymCryptModulusNdigitsOffsetArm64] // # of Digits
+    ldr W_4, [X_0, #SymCryptModulusNdigitsOffsetArm64] // # of Digits
     ldr X_5, [X_0, #SymCryptModulusInv64OffsetArm64] // Inv64 of modulus
     add X_0, X_0, #SymCryptModulusValueOffsetArm64 // pMod
 
-    lsl X_4, X_3, #2 // Multiply by 4 to get the number of words
+    lsl X_3, X_4, #5 // Multiply by 32 to get the number of bytes
+    lsl X_4, X_4, #2 // Multiply by 4 to get the number of words
 
-    sub X_0, X_0, #32 // offset pMod so we can use pre-increment form of loads
-    sub X_1, X_1, #32 // offset pSrc so we can use pre-increment form of loads
-    sub X_2, X_2, #32 // offset pDst so we can use pre-increment form of loads
+    sub X_3, X_3, #32 // offset the byte count to make it easy to restore pointers
 
-    mov X_15, X_3 // Store the digit count for later
-    mov X_16, X_0 // Store the pMod pointer
-    mov X_17, X_1 // Store the pSrc pointer
-
-    and X_7, X_7, xzr // Set hc to 0
+    mov X_7, xzr // Set hc to 0
+    mov X_21, X_3 // Store the byte count for later
 
     //
     // Main loop
     //
 LABEL(SymCryptFdefMontgomeryReduceAsmOuter)
-    ldr X_8, [X_1, #32] // Load 1 word from pSrc
-    mul X_6, X_8, X_5 // <63:0> bits of pSrc[i]*Inv64 = m
+    ldp X_16, X_17, [X_1] // pSrc[i+0], pSrc[i+1]
+    mul X_6, X_16, X_5 // <63:0> bits of pSrc[i]*Inv64 = m
 
-    and X_12, X_12, xzr // Set c to 0
+    ldp X_8, X_9, [X_0] // pMod[0], pMod[1]
+    umulh X_8, X_6, X_8 // Bits <127:64> of m*pMod[0]
+    subs xzr, X_16, #1 // Subtract 1 from pSrc[i+0] (generate a carry iff pSrc[i] is non-zero)
+
+    mul X_13, X_6, X_9 // Bits <63:0> of m*pMod[1]
+    umulh X_9, X_6, X_9 // Bits <127:64> of m*pMod[1]
+    adcs X_8, X_8, X_13 // Adding the previous word (if there was a carry from the last addition it is added)
+
+    ldp X_10, X_11, [X_0, #16] // pMod[2], pMod[3]
+    mul X_14, X_6, X_10 // Bits <63:0> of m*pMod[2]
+    umulh X_10, X_6, X_10 // Bits <127:64> of m*pMod[2]
+    adcs X_9, X_9, X_14 // Adding the previous word (if there was a carry from the last addition it is added)
+
+    ldp X_18, X_19, [X_1, #16] // pSrc[i+2], pSrc[i+3]
+    mul X_15, X_6, X_11 // Bits <63:0> of m*pMod[3]
+    umulh X_11, X_6, X_11 // Bits <127:64> of m*pMod[3]
+    adcs X_10, X_10, X_15 // Adding the previous word (if there was a carry from the last addition it is added)
+
+    adc X_20, X_11, xzr // Add the carry if any
+
+    // ( X_20, X_10, X_9, X_8, ___ ) = (( pMod[3], pMod[2], pMod[1], pMod[0] ) * m)
+    // We do not compute this --^ because (m*pMod[0] + pSrc[i]) == 0. We already accounted for the carry above.
+
+    adds X_17, X_17, X_8 // Adding pSrc[i+1]
+    adcs X_18, X_18, X_9 // Adding pSrc[i+2]
+    adcs X_19, X_19, X_10 // Adding pSrc[i+3]
+
+    stp xzr, X_17, [X_1] // pSrc[i+0], pSrc[i+1]
+    stp X_18, X_19, [X_1, #16] // pSrc[i+2], pSrc[i+3]
+
+    cbz X_3, SymCryptFdefMontgomeryReduceAsmInnerEnd
 
 LABEL(SymCryptFdefMontgomeryReduceAsmInner)
-    ldp X_10, X_11, [X_0, #32]! // pMod[j]
-    ldp X_8, X_9, [X_1, #32]! // pSrc[j]
+    // Invariant - X_20 (c) + carry flag is a value in the range [0, 2^64-1]
+    // We could always adc X_20, X_20, xzr before loop entry, but this is a wasted instruction
+    // vs. adcs with X_20 below
 
-    mul X_14, X_6, X_10 // <63:0> of pMod[j]*m
-    adds X_14, X_14, X_8 // Adding pSrc[j]
-    umulh X_13, X_6, X_10 // <127:64> of pMod[j]*m
-    adc X_13, X_13, xzr // Add the carry if any (***)
-    adds X_12, X_12, X_14 // Add the lower bits of c
-    adc X_13, X_13, xzr // Add the carry if any (***)
-    // ***: These cannot produce extra carry as the maximum is
-    // (2^64 - 1)*(2^64-1) + 2^64-1 + 2^64-1 = 2^128 - 1
-    str X_12, [X_1] // pSrc[j] = (UINT64) c
+    ldp X_8, X_9, [X_0, #32]! // pMod[j+0], pMod[j+1]
+    mul X_12, X_6, X_8 // Bits <63:0> of m*pMod[j]
+    umulh X_8, X_6, X_8 // Bits <127:64> of m*pMod[j]
+    adcs X_12, X_20, X_12 // Adding the previous word (if there was a carry from the last addition it is added)
 
-    mul X_14, X_6, X_11 // <63:0> of pMod[j]*m
-    adds X_14, X_14, X_9 // Adding pSrc[j]
-    umulh X_12, X_6, X_11 // <127:64> of pMod[j]*m
-    adc X_12, X_12, xzr // Add the carry if any (***)
-    adds X_13, X_13, X_14 // Add the lower bits of c
-    adc X_12, X_12, xzr // Add the carry if any (***)
-    str X_13, [X_1, #8] // pSrc[j] = (UINT64) c
+    mul X_13, X_6, X_9 // Bits <63:0> of m*pMod[j+1]
+    umulh X_9, X_6, X_9 // Bits <127:64> of m*pMod[j+1]
+    ldp X_16, X_17, [X_1, #32]! // pSrc[i+j+0], pSrc[i+j+1]
+    adcs X_8, X_8, X_13 // Adding the previous word (if there was a carry from the last addition it is added)
 
-    ldp X_10, X_11, [X_0, #16] // pMod[j]
-    ldp X_8, X_9, [X_1, #16] // pSrc[j]
+    ldp X_10, X_11, [X_0, #16] // pMod[j+2], pMod[j+3]
+    mul X_14, X_6, X_10 // Bits <63:0> of m*pMod[j+2]
+    umulh X_10, X_6, X_10 // Bits <127:64> of m*pMod[j+2]
+    adcs X_9, X_9, X_14 // Adding the previous word (if there was a carry from the last addition it is added)
 
-    mul X_14, X_6, X_10 // <63:0> of pMod[j]*m
-    adds X_14, X_14, X_8 // Adding pSrc[j]
-    umulh X_13, X_6, X_10 // <127:64> of pMod[j]*m
-    adc X_13, X_13, xzr // Add the carry if any (***)
-    adds X_12, X_12, X_14 // Add the lower bits of c
-    adc X_13, X_13, xzr // Add the carry if any (***)
-    str X_12, [X_1, #16] // pSrc[j] = (UINT64) c
+    ldp X_18, X_19, [X_1, #16] // pSrc[i+j+2], pSrc[i+j+3]
+    mul X_15, X_6, X_11 // Bits <63:0> of m*pMod[j+3]
+    umulh X_11, X_6, X_11 // Bits <127:64> of m*pMod[j+3]
+    adcs X_10, X_10, X_15 // Adding the previous word (if there was a carry from the last addition it is added)
 
-    mul X_14, X_6, X_11 // <63:0> of pMod[j]*m
-    adds X_14, X_14, X_9 // Adding pSrc[j]
-    umulh X_12, X_6, X_11 // <127:64> of pMod[j]*m
-    adc X_12, X_12, xzr // Add the carry if any (***)
-    adds X_13, X_13, X_14 // Add the lower bits of c
-    adc X_12, X_12, xzr // Add the carry if any (***)
-    str X_13, [X_1, #24] // pSrc[j] = (UINT64) c
+    adc X_20, X_11, xzr // Add the carry if any
 
-    subs X_3, X_3, #1 // Move one digit up
-    bne SymCryptFdefMontgomeryReduceAsmInner
+    // ( X_20, X_10, X_9, X_8, X_12 ) = (( pMod[j+3], pMod[j+2], pMod[j+1], pMod[j+0] ) * m) + c
+    adds X_16, X_16, X_12 // Adding pSrc[i+j+0]
+    adcs X_17, X_17, X_8 // Adding pSrc[i+j+1]
+    adcs X_18, X_18, X_9 // Adding pSrc[i+j+2]
+    adcs X_19, X_19, X_10 // Adding pSrc[i+j+3]
+
+    stp X_16, X_17, [X_1] // pSrc[i+j+0], pSrc[i+j+1]
+    stp X_18, X_19, [X_1, #16] // pSrc[i+j+2], pSrc[i+j+3]
+
+    sub X_3, X_3, #32 // Move one digit up
+    cbnz X_3, SymCryptFdefMontgomeryReduceAsmInner
+
+LABEL(SymCryptFdefMontgomeryReduceAsmInnerEnd)
 
     ldr X_8, [X_1, #32] // pSrc[nWords]
-    adds X_12, X_12, X_8 // c + pSrc[nWords]
-    adc X_13, xzr, xzr // Add the carry if any
+    adcs X_20, X_20, X_7 // c + hc
+    adc X_7, xzr, xzr // Add the carry if any
 
-    adds X_12, X_12, X_7 // c + pSrc[nWords] + hc
-    adc X_7, X_13, xzr // Add the carry if any and store into hc
+    adds X_20, X_20, X_8 // c + hc + pSrc[nWords]
+    adc X_7, X_7, xzr // Add the carry if any and store into hc
 
-    str X_12, [X_1, #32] // pSrc[nWords] = c
+    str X_20, [X_1, #32] // pSrc[nWords] = c
 
     subs X_4, X_4, #1 // Move one word up
 
-    add X_17, X_17, #8 // Move stored pSrc pointer one word up
-    mov X_0, X_16 // Restore pMod pointer
-    mov X_1, X_17 // Restore pSrc pointer
+    sub X_0, X_0, X_21 // Restore pMod pointer (subtract byte count)
+    sub X_1, X_1, X_21 // Restore pSrc pointer (subtract byte count)
+    add X_1, X_1, #8 // Move pSrc pointer one word up
 
-    mov X_3, X_15 // Restore the digit counter
+    mov X_3, X_21 // Restore the byte counter
 
     bne SymCryptFdefMontgomeryReduceAsmOuter
 
     //
     // Subtraction
     //
+    add X_3, X_3, #32 // restore the full byte count for loops using unconditional pre-indexing
+    mov X_21, X_3 // Store the byte count for later
 
+    sub X_0, X_0, #32 // offset pMod so we can use pre-increment form of loads
+    sub X_1, X_1, #32 // offset pSrc so we can use pre-increment form of loads
+    sub X_2, X_2, #32 // offset pDst so we can use pre-increment form of loads
     mov X_14, X_2 // Store pDst pointer
 
     // Prepare the pointers for subtract
-    mov X_0, X_17 // pSrc
-    mov X_1, X_16 // pMod
 
     mov X_10, X_7 // X_10 = hc
-    mov X_3, X_15 // Restore the digit counter
     subs X_4, X_4, X_4 // Set the carry flag (i.e. no borrow)
 
 LABEL(SymCryptFdefMontgomeryReduceRawSubAsmLoop)
-    sub X_3, X_3, #1 // Decrement the digit count by one
+    sub X_3, X_3, #32 // Decrement the byte count by 32
     // borrow is in the carry flag (flipped)
 
-    ldp X_4, X_6, [X_0, #32]! // Load two words of pSrc1
-    ldp X_5, X_7, [X_1, #32]! // Load two words of pSrc2
+    ldp X_4, X_6, [X_1, #32]! // Load two words of pSrc
+    ldp X_5, X_7, [X_0, #32]! // Load two words of pMod
     sbcs X_4, X_4, X_5
     sbcs X_6, X_6, X_7
     stp X_4, X_6, [X_2, #32]! // Store the result in the destination
 
-    ldp X_4, X_6, [X_0, #16] // Load two words of pSrc1
-    ldp X_5, X_7, [X_1, #16] // Load two words of pSrc2
+    ldp X_4, X_6, [X_1, #16] // Load two words of pSrc
+    ldp X_5, X_7, [X_0, #16] // Load two words of pMod
     sbcs X_4, X_4, X_5
     sbcs X_6, X_6, X_7
     stp X_4, X_6, [X_2, #16] // Store the result in the destination
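
The special-cased first iteration of the reduction loop rests on two facts stated in the comments above: the low 64 bits of m*pMod[0] + pSrc[i] are zero by construction of Inv64, so that word is never computed or stored (hence `stp xzr, X_17, [X_1]`), and the carry out of the skipped addition is 1 exactly when pSrc[i] is non-zero, which `subs xzr, X_16, #1` reproduces in the carry flag. A small standalone Python check of both facts (illustrative only, not SymCrypt code):

```python
# Check the first-iteration shortcut: low64(m*pMod[0] + pSrc[i]) == 0, and the
# carry out of that addition is 1 iff pSrc[i] != 0 (what 'subs xzr, X_16, #1' recreates).
import random

M64 = (1 << 64) - 1

for _ in range(10000):
    mod0 = random.getrandbits(64) | 1                    # low word of an odd modulus
    inv64 = (-pow(mod0, -1, 1 << 64)) % (1 << 64)        # Inv64 = -pMod[0]^-1 mod 2^64
    src0 = 0 if random.random() < 0.1 else random.getrandbits(64)
    m = (src0 * inv64) & M64
    t = (m * mod0 & M64) + src0                          # the addition the new code skips
    assert t & M64 == 0                                  # low word is always zero
    assert (t >> 64) == (1 if src0 != 0 else 0)          # carry iff pSrc[i] non-zero
```
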
@@ -1381,14 +1402,14 @@ LABEL(SymCryptFdefMontgomeryReduceRawSubAsmLoop)
     orr X_11, X_10, X_0 // X_11 = hc|d
 
     // Prepare the pointers for masked copy
-    mov X_0, X_17 // pSrc
+    sub X_0, X_1, X_21 // Restore pSrc pointer (subtract byte count)
     mov X_1, X_14 // pDst
 
-    mov X_2, X_15 // Restore the digit counter
+    mov X_2, X_21 // Restore the byte counter
     subs X_4, X_10, X_11 // If (X_11 > X_10) clear the carry flag (i.e. borrow)
 
 LABEL(SymCryptFdefMontgomeryReduceMaskedCopyAsmLoop)
-    sub X_2, X_2, #1 // decrement the digit count by one
+    sub X_2, X_2, #32 // decrement the byte count by 32
 
     ldp X_4, X_6, [X_0, #32]! // Load two words of the source
     ldp X_5, X_7, [X_1, #32]! // Load two words of the destination
@@ -645,26 +645,6 @@ MAPPING_ARM32_AAPCS32 = {
     15:ARM32_R15, # PC
 }
 
-def gen_prologue_aapcs64(self, arg_count, reg_count, stack_alloc_size, xmm_reg_count):
-    prologue = ""
-
-    if reg_count > self.volatile_registers:
-        logging.error("symcryptasm currently does not support spilling registers in leaf functions in aapcs64")
-        exit(1)
-
-    return prologue
-
-def gen_epilogue_aapcs64(self, arg_count, reg_count, stack_alloc_size, xmm_reg_count):
-    epilogue = ""
-
-    if reg_count > self.volatile_registers:
-        logging.error("symcryptasm currently does not support spilling registers in leaf functions in aapcs64")
-        exit(1)
-
-    epilogue += " ret\n"
-
-    return epilogue
-
 def gen_prologue_aapcs32(self, arg_count, reg_count, stack_alloc_size, xmm_reg_count):
     assert(not stack_alloc_size and not xmm_reg_count)
     prologue = ""
@@ -697,7 +677,7 @@ def gen_epilogue_aapcs32(self, arg_count, reg_count, stack_alloc_size, xmm_reg_c
     epilogue += "pop {" + ",".join(registers_to_spill) + "}\n"
     return epilogue
 
-def gen_prologue_arm64ec(self, arg_count, reg_count, stack_alloc_size, xmm_reg_count):
+def gen_prologue_arm64_armasm64(self, arg_count, reg_count, stack_alloc_size, xmm_reg_count):
    prologue = ""
 
    if reg_count > self.volatile_registers:
@@ -717,7 +697,7 @@ def gen_prologue_arm64ec(self, arg_count, reg_count, stack_alloc_size, xmm_reg_c
 
    return prologue
 
-def gen_epilogue_arm64ec(self, arg_count, reg_count, stack_alloc_size, xmm_reg_count):
+def gen_epilogue_arm64_armasm64(self, arg_count, reg_count, stack_alloc_size, xmm_reg_count):
    epilogue = ""
 
    if reg_count > self.volatile_registers:
@@ -741,17 +721,63 @@ def gen_epilogue_arm64ec(self, arg_count, reg_count, stack_alloc_size, xmm_reg_c
 
    return epilogue
 
+def gen_prologue_arm64_gas(self, arg_count, reg_count, stack_alloc_size, xmm_reg_count):
+    prologue = ""
+
+    if reg_count > self.volatile_registers:
+        # Calculate required stack space
+        # If we allocate stack space we must spill fp and lr, so we always spill at least 2 registers
+        registers_to_spill = 2 + reg_count - self.volatile_registers
+        # Stack pointer remain 16B aligned, so round up to the nearest multiple of 16B
+        required_stack_space = 16 * ((registers_to_spill + 1) // 2)
+        prologue += " stp fp, lr, [sp, #-%d]! // allocate %d bytes of stack; store FP/LR\n" % (required_stack_space, required_stack_space)
+
+        stack_offset = 16
+        for i in range(self.volatile_registers, reg_count-1, 2):
+            prologue += " stp X_%d, X_%d, [sp, #%d]\n" % (i, i+1, stack_offset)
+            stack_offset += 16
+        if registers_to_spill % 2 == 1:
+            prologue += " str X_%d, [sp, #%d]\n" % (reg_count-1, stack_offset)
+
+    return prologue
+
+def gen_epilogue_arm64_gas(self, arg_count, reg_count, stack_alloc_size, xmm_reg_count):
+    epilogue = ""
+
+    if reg_count > self.volatile_registers:
+        # Calculate required stack space
+        # If we allocate stack space we must spill fp and lr, so we always spill at least 2 registers
+        registers_to_spill = 2 + reg_count - self.volatile_registers
+        # Stack pointer remain 16B aligned, so round up to the nearest multiple of 16B
+        required_stack_space = 16 * ((registers_to_spill + 1) // 2)
+
+        stack_offset = required_stack_space-16
+        if registers_to_spill % 2 == 1:
+            epilogue += " ldr X_%d, [sp, #%d]\n" % (reg_count-1, stack_offset)
+            stack_offset -= 16
+        for i in reversed(range(self.volatile_registers, reg_count-1, 2)):
+            epilogue += " ldp X_%d, X_%d, [sp, #%d]\n" % (i, i+1, stack_offset)
+            stack_offset -= 16
+        epilogue += " ldp fp, lr, [sp], #%d // deallocate %d bytes of stack; restore FP/LR\n" % (required_stack_space, required_stack_space)
+    epilogue += " ret\n"
+
+    return epilogue
+
 def gen_get_memslot_offset_arm64(self, slot, arg_count, reg_count, stack_alloc_size, xmm_reg_count, nested=False):
     logging.error("symcryptasm currently does not support memory slots for arm64!")
     exit(1)
 
-CALLING_CONVENTION_ARM64_AAPCS64 = CallingConvention(
+CALLING_CONVENTION_ARM64_AAPCS64_ARMASM64 = CallingConvention(
     "arm64_aapcs64", "arm64", MAPPING_ARM64_AAPCS64, 8, 8, 18,
-    gen_prologue_aapcs64, gen_epilogue_aapcs64, gen_get_memslot_offset_arm64)
+    gen_prologue_arm64_armasm64, gen_epilogue_arm64_armasm64, gen_get_memslot_offset_arm64)
 
+CALLING_CONVENTION_ARM64_AAPCS64_GAS = CallingConvention(
+    "arm64_aapcs64", "arm64", MAPPING_ARM64_AAPCS64, 8, 8, 18,
+    gen_prologue_arm64_gas, gen_epilogue_arm64_gas, gen_get_memslot_offset_arm64)
+
 CALLING_CONVENTION_ARM64EC_MSFT = CallingConvention(
     "arm64ec_msft", "arm64", MAPPING_ARM64_ARM64ECMSFT, 8, 8, 16,
-    gen_prologue_arm64ec, gen_epilogue_arm64ec, gen_get_memslot_offset_arm64)
+    gen_prologue_arm64_armasm64, gen_epilogue_arm64_armasm64, gen_get_memslot_offset_arm64)
 
 CALLING_CONVENTION_ARM32_AAPCS32 = CallingConvention(
     "arm32_aapcs32", "arm32", MAPPING_ARM32_AAPCS32, 4, 4, 4,
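
As a worked example of the new aapcs64 prologue/epilogue: this calling convention maps 18 volatile `X_n` registers, and `SymCryptFdefMontgomeryReduceAsm` now declares 22, so fp, lr and four non-volatile registers are spilled into a 48-byte, 16-byte-aligned frame. The standalone sketch below mirrors the size computation from `gen_prologue_arm64_gas`; it is illustrative only and does not import the actual processor script.

```python
def aapcs64_spill_frame(reg_count, volatile_registers=18):
    """Mirror of the stack-frame sizing in gen_prologue_arm64_gas above."""
    if reg_count <= volatile_registers:
        return 0, 0                      # leaf function, nothing spilled
    # fp and lr are always stored when a frame is allocated, hence the +2
    registers_to_spill = 2 + reg_count - volatile_registers
    # sp must stay 16-byte aligned: two 8-byte registers per slot, rounded up
    required_stack_space = 16 * ((registers_to_spill + 1) // 2)
    return registers_to_spill, required_stack_space

# SymCryptFdefMontgomeryReduceAsm: FUNCTION_START(..., 3, 22) with 18 volatile registers
print(aapcs64_spill_frame(22))           # -> (6, 48): fp, lr and X_18..X_21 in 48 bytes
```
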
@@ -1194,7 +1220,7 @@ def process_file(assembler, architecture, calling_convention, infilename, outfil
             mul_calling_convention = CALLING_CONVENTION_AMD64_SYSTEMV_MUL
             nested_calling_convention = CALLING_CONVENTION_AMD64_SYSTEMV_NESTED
         elif architecture == "arm64" and calling_convention == "aapcs64":
-            normal_calling_convention = CALLING_CONVENTION_ARM64_AAPCS64
+            normal_calling_convention = CALLING_CONVENTION_ARM64_AAPCS64_GAS
             mul_calling_convention = None
             nested_calling_convention = None
         elif architecture == "arm" and calling_convention == "aapcs32":
@@ -1203,7 +1229,7 @@ def process_file(assembler, architecture, calling_convention, infilename, outfil
             nested_calling_convention = None
     elif assembler == "armasm64":
         if architecture == "arm64" and calling_convention == "aapcs64":
-            normal_calling_convention = CALLING_CONVENTION_ARM64_AAPCS64
+            normal_calling_convention = CALLING_CONVENTION_ARM64_AAPCS64_ARMASM64
             mul_calling_convention = None
             nested_calling_convention = None
         elif architecture == "arm64" and calling_convention == "arm64ec":
@@ -1 +1 @@
-{ "major": 103, "minor": 4, "patch": 2 }
+{ "major": 103, "minor": 4, "patch": 3 }