Mirror of https://github.com/microsoft/SymCrypt.git
Merged PR 6438924: Enable SymCryptAsm for Arm64
+ Extends SymCryptAsm format and script to work in the Arm64 context
+ Now specify architecture, assembler, and calling convention in script invocation
+ Make various changes to assembly to remove redundant instructions, and generally slightly improve perf for all platforms (a couple of % here and there)
+ Use assembly routines in Linux builds and remove asmstubs file
+ Do not enable Windows Arm64 build with CMake yet

Related work items: #35613721
Parent
c5ef94321c
Commit
2bc541799d
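As an illustration of the new invocation shape described in the commit message, here is a minimal CMake sketch. The helper name, the arguments (gas, arm64, aapcs64), and the processed file all appear verbatim in the Linux Arm64 hunks below; the forwarded command line and its intermediate output name are only indicative.

    # Extended helper signature: process_symcryptasm(<file> <outformat> <archdefine> <callingconvention>)
    # Arguments mirror the Linux Arm64 branch added in the CMake script shown below:
    process_symcryptasm(arm64/fdef_asm.symcryptasm gas arm64 aapcs64)

    # The helper now forwards all three options to the Python processor, roughly:
    #   python3 scripts/symcryptasm_processor.py gas arm64 aapcs64 arm64/fdef_asm.symcryptasm <output_cppasm>
    # where <output_cppasm> is the intermediate file that is subsequently run through the C preprocessor.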
@@ -61,6 +61,9 @@ if(WIN32)
else()
|
||||
if(NOT SYMCRYPT_TARGET_ENV MATCHES "Generic")
|
||||
enable_language(ASM)
|
||||
# Suppress noisy warnings about compile options which are ignored for ASM
|
||||
# Less messy than restricting most of the below options to only C/CXX!
|
||||
add_compile_options($<$<COMPILE_LANGUAGE:ASM>:-Wno-unused-command-line-argument>)
|
||||
endif()
|
||||
# add_compile_options(-Wall)
|
||||
# add_compile_options(-Wno-unknown-pragmas)
|
||||
@@ -76,6 +79,12 @@ else()
# Avoids error: cast from pointer to smaller type 'uintptr_t' when including <memory> from aarch64-linux-gnu
|
||||
add_compile_options(-fms-extensions)
|
||||
|
||||
# GCC and clang unroll more aggressively than they should for best performance
|
||||
# When we want to unroll loops, we unroll in the source code, so tell the compiler not to unroll
|
||||
# (clang seems to respect this option globally, but I could only make GCC behave in AES-GCM by
|
||||
# using GCC-specific pragmas for the loops of interest)
|
||||
add_compile_options(-fno-unroll-loops)
|
||||
|
||||
# In Sanitize version, enable sanitizers
|
||||
if (CMAKE_BUILD_TYPE MATCHES Sanitize)
|
||||
add_compile_options(-fsanitize=address)
|
||||
@@ -120,12 +129,6 @@ else()
add_link_options(-fsanitize=vptr)
|
||||
add_link_options(-fno-sanitize-recover=all)
|
||||
endif()
|
||||
|
||||
# GCC and clang unroll more aggressively than they should for best performance
|
||||
# When we want to unroll loops, we unroll in the source code, so tell the compiler not to unroll
|
||||
# (clang seems to respect this option globally, but I could only make GCC behave in AES-GCM by
|
||||
# using GCC-specific pragmas for the loops of interest)
|
||||
add_compile_options(-fno-unroll-loops)
|
||||
endif()
|
||||
|
||||
if(CMAKE_BUILD_TYPE MATCHES Release)
|
||||
@@ -1,5 +1,5 @@
# This toolchain file configures CMake options for Linux User Mode AMD64 compilation with CPU optimizations.
|
||||
# To use the toolchain file, run cmake .. -DCMAKE_TOOLCHAIN_FILE=cmake-toolchain/LinuxUserMode-AMD64.cmake
|
||||
# To use the toolchain file, run cmake .. -DCMAKE_TOOLCHAIN_FILE="cmake-toolchain/LinuxUserMode-AMD64.cmake"
|
||||
|
||||
# Set CMake variables that subsequent CMake scripts can check against
|
||||
set(CMAKE_SYSTEM_NAME Linux)
|
||||
@@ -1,5 +1,5 @@
# This toolchain file configures CMake options for Linux User Mode ARM64 compilation with CPU optimizations.
|
||||
# To use the toolchain file, run cmake .. -DCMAKE_TOOLCHAIN_FILE=cmake-toolchain/LinuxUserMode-ARM64.cmake
|
||||
# To use the toolchain file, run cmake .. -DCMAKE_TOOLCHAIN_FILE="cmake-toolchain/LinuxUserMode-ARM64.cmake"
|
||||
|
||||
# Set CMake variables that subsequent CMake scripts can check against
|
||||
set(CMAKE_SYSTEM_NAME Linux)
|
||||
@@ -8,13 +8,14 @@ set(CMAKE_SYSTEM_PROCESSOR ARM64)
set(TARGET_TRIPLE aarch64-linux-gnu)
|
||||
|
||||
# Currently only use clang as it makes cross-compilation easier
|
||||
set(CMAKE_ASM_COMPILER_TARGET ${TARGET_TRIPLE})
|
||||
set(CMAKE_C_COMPILER clang)
|
||||
set(CMAKE_C_COMPILER_TARGET ${TARGET_TRIPLE})
|
||||
set(CMAKE_CXX_COMPILER clang++)
|
||||
set(CMAKE_CXX_COMPILER_TARGET ${TARGET_TRIPLE})
|
||||
|
||||
# Point clang sysroot to cross compilation toolchain when cross compiling
|
||||
if(NOT CMAKE_HOST_SYSTEM_PROCESSOR EQUAL CMAKE_SYSTEM_PROCESSOR)
|
||||
if(NOT CMAKE_HOST_SYSTEM_PROCESSOR MATCHES ARM64|aarch64)
|
||||
# C/C++ toolchain (installed on Ubuntu using apt-get gcc-aarch64-linux-gnu g++-aarch64-linux-gnu)
|
||||
set(CMAKE_SYSROOT_COMPILE /usr/${TARGET_TRIPLE})
|
||||
|
||||
@@ -1,5 +1,5 @@
# This toolchain file configures CMake options for Windows User Mode AMD64 compilation with CPU optimizations.
|
||||
# To use the toolchain file, run cmake .. -DCMAKE_TOOLCHAIN_FILE=cmake-toolchain/WindowsUserMode-AMD64.cmake
|
||||
# To use the toolchain file, run cmake .. -DCMAKE_TOOLCHAIN_FILE="cmake-toolchain/WindowsUserMode-AMD64.cmake"
|
||||
|
||||
# Set CMake variables that subsequent CMake scripts can check against
|
||||
set(CMAKE_SYSTEM_NAME Windows)
|
||||
@@ -1,5 +1,5 @@
# This toolchain file configures CMake options for Windows User Mode x86 compilation with CPU optimizations.
|
||||
# To use the toolchain file, run cmake .. -DCMAKE_TOOLCHAIN_FILE=cmake-toolchain/WindowsUserMode-X86.cmake -A Win32
|
||||
# To use the toolchain file, run cmake .. -DCMAKE_TOOLCHAIN_FILE="cmake-toolchain/WindowsUserMode-X86.cmake" -A Win32
|
||||
#
|
||||
# (The "-A Win32" option seems to be required when compiling on a 64-bit host. Ideally this toolchain file
|
||||
# should set all the required options, but I haven't figured out how to force 32-bit compilation from the
|
||||
@@ -105,7 +105,7 @@ function(process_cppasm filepath outformat archdefine)
if((NOT outformat STREQUAL gas) AND (NOT outformat STREQUAL masm))
|
||||
message(FATAL_ERROR "cppasm processing invoked with unrecognized outformat (${outformat})")
|
||||
endif()
|
||||
if((NOT archdefine STREQUAL amd64) AND (NOT archdefine STREQUAL x86))
|
||||
if((NOT archdefine STREQUAL amd64) AND (NOT archdefine STREQUAL x86) AND (NOT archdefine STREQUAL arm64))
|
||||
message(FATAL_ERROR "cppasm processing invoked with unrecognized archdefine (${archdefine})")
|
||||
endif()
|
||||
get_filename_component(rootpath ${filepath} DIRECTORY)
|
||||
@@ -151,14 +151,20 @@ function(process_cppasm filepath outformat archdefine)
endif()
|
||||
endfunction()
|
||||
|
||||
function(process_symcryptasm filepath outformat archdefine)
|
||||
function(process_symcryptasm filepath outformat archdefine callingconvention)
|
||||
get_filename_component(fileextension ${filepath} EXT)
|
||||
if(NOT fileextension STREQUAL .symcryptasm)
|
||||
message(FATAL_ERROR "symcryptasm processing invoked on file with incorrect extension (${filepath} -> ${fileextension})")
|
||||
endif()
|
||||
if((NOT outformat STREQUAL gas) AND (NOT outformat STREQUAL masm))
|
||||
if((NOT outformat STREQUAL gas) AND (NOT outformat STREQUAL masm) AND (NOT outformat STREQUAL armasm64))
|
||||
message(FATAL_ERROR "symcryptasm processing invoked with unrecognized outformat (${outformat})")
|
||||
endif()
|
||||
if((NOT archdefine STREQUAL amd64) AND (NOT archdefine STREQUAL x86) AND (NOT archdefine STREQUAL arm64))
|
||||
message(FATAL_ERROR "symcryptasm processing invoked with unrecognized archdefine (${archdefine})")
|
||||
endif()
|
||||
if((NOT callingconvention STREQUAL msft) AND (NOT callingconvention STREQUAL systemv) AND (NOT callingconvention STREQUAL aapcs64) AND (NOT callingconvention STREQUAL arm64ec))
|
||||
message(FATAL_ERROR "symcryptasm processing invoked with unrecognized callingconvention (${callingconvention})")
|
||||
endif()
|
||||
get_filename_component(rootpath ${filepath} DIRECTORY)
|
||||
get_filename_component(filestem ${filepath} NAME_WE) # filestem is the filename w/out extension
|
||||
set(filepath ${CMAKE_CURRENT_SOURCE_DIR}/${filepath})
|
||||
@@ -168,7 +174,7 @@ function(process_symcryptasm filepath outformat archdefine)
add_custom_command(
|
||||
OUTPUT ${output_cppasm}
|
||||
COMMAND ${CMAKE_COMMAND} -E make_directory ${output_directory}
|
||||
COMMAND python3 ${CMAKE_SOURCE_DIR}/scripts/symcryptasm_processor.py ${outformat} ${filepath} ${output_cppasm}
|
||||
COMMAND python3 ${CMAKE_SOURCE_DIR}/scripts/symcryptasm_processor.py ${outformat} ${archdefine} ${callingconvention} ${filepath} ${output_cppasm}
|
||||
MAIN_DEPENDENCY ${filepath}
|
||||
DEPENDS ${CMAKE_SOURCE_DIR}/scripts/symcryptasm_processor.py
|
||||
COMMENT "Python preprocessing ${filepath} to ${outformat} (${output_cppasm})"
|
||||
@@ -183,19 +189,15 @@ else()
if(CMAKE_SYSTEM_PROCESSOR MATCHES "AMD64")
|
||||
list(APPEND SOURCES_COMMON linux/intrinsics.c)
|
||||
endif()
|
||||
|
||||
if(CMAKE_SYSTEM_PROCESSOR MATCHES "ARM64")
|
||||
list(APPEND SOURCES_COMMON linux/asmstubs.c)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if(WIN32 AND NOT(SYMCRYPT_TARGET_ENV MATCHES "Generic"))
|
||||
if(CMAKE_SYSTEM_PROCESSOR MATCHES "AMD64")
|
||||
process_symcryptasm(amd64/aesasm.symcryptasm masm amd64)
|
||||
process_symcryptasm(amd64/fdef_asm.symcryptasm masm amd64)
|
||||
process_symcryptasm(amd64/fdef369_asm.symcryptasm masm amd64)
|
||||
process_symcryptasm(amd64/fdef_mulx.symcryptasm masm amd64)
|
||||
process_symcryptasm(amd64/wipe.symcryptasm masm amd64)
|
||||
process_symcryptasm(amd64/aesasm.symcryptasm masm amd64 msft)
|
||||
process_symcryptasm(amd64/fdef_asm.symcryptasm masm amd64 msft)
|
||||
process_symcryptasm(amd64/fdef369_asm.symcryptasm masm amd64 msft)
|
||||
process_symcryptasm(amd64/fdef_mulx.symcryptasm masm amd64 msft)
|
||||
process_symcryptasm(amd64/wipe.symcryptasm masm amd64 msft)
|
||||
|
||||
list(APPEND SOURCES_COMMON
|
||||
amd64/aesasm-masm.asm
|
||||
@@ -229,11 +231,11 @@ if(WIN32 AND NOT(SYMCRYPT_TARGET_ENV MATCHES "Generic"))
endif()
|
||||
elseif(NOT(SYMCRYPT_TARGET_ENV MATCHES "Generic"))
|
||||
if(CMAKE_SYSTEM_PROCESSOR MATCHES "AMD64")
|
||||
process_symcryptasm(amd64/aesasm.symcryptasm gas amd64)
|
||||
process_symcryptasm(amd64/fdef_asm.symcryptasm gas amd64)
|
||||
process_symcryptasm(amd64/fdef369_asm.symcryptasm gas amd64)
|
||||
process_symcryptasm(amd64/fdef_mulx.symcryptasm gas amd64)
|
||||
process_symcryptasm(amd64/wipe.symcryptasm gas amd64)
|
||||
process_symcryptasm(amd64/aesasm.symcryptasm gas amd64 systemv)
|
||||
process_symcryptasm(amd64/fdef_asm.symcryptasm gas amd64 systemv)
|
||||
process_symcryptasm(amd64/fdef369_asm.symcryptasm gas amd64 systemv)
|
||||
process_symcryptasm(amd64/fdef_mulx.symcryptasm gas amd64 systemv)
|
||||
process_symcryptasm(amd64/wipe.symcryptasm gas amd64 systemv)
|
||||
|
||||
list(APPEND SOURCES_COMMON
|
||||
amd64/aesasm-gas.asm
|
||||
@@ -248,6 +250,20 @@ elseif(NOT(SYMCRYPT_TARGET_ENV MATCHES "Generic"))
amd64/fdef_mulx-gas.asm
|
||||
amd64/wipe-gas.asm
|
||||
PROPERTY LANGUAGE ASM)
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ARM64")
|
||||
process_symcryptasm(arm64/fdef_asm.symcryptasm gas arm64 aapcs64)
|
||||
process_symcryptasm(arm64/fdef369_asm.symcryptasm gas arm64 aapcs64)
|
||||
process_symcryptasm(arm64/wipe.symcryptasm gas arm64 aapcs64)
|
||||
|
||||
list(APPEND SOURCES_COMMON
|
||||
arm64/fdef_asm-gas.asm
|
||||
arm64/fdef369_asm-gas.asm
|
||||
arm64/wipe-gas.asm)
|
||||
set_source_files_properties(
|
||||
arm64/fdef_asm-gas.asm
|
||||
arm64/fdef369_asm-gas.asm
|
||||
arm64/wipe-gas.asm
|
||||
PROPERTY LANGUAGE ASM)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
@@ -31,7 +31,7 @@ const SYMCRYPT_MODULAR_FUNCTIONS g_SymCryptModFns[] = {
SYMCRYPT_MOD_FUNCTIONS_FDEF_MONTGOMERY1024, // Special faster code for 1024-bit Montgomery moduli
|
||||
SYMCRYPT_MOD_FUNCTIONS_FDEF_MONTGOMERY_MULX1024, // Special faster code for 1024-bit Montgomery moduli, MULX-based code
|
||||
|
||||
#elif SYMCRYPT_CPU_ARM64 && SYMCRYPT_MS_VC
|
||||
#elif SYMCRYPT_CPU_ARM64
|
||||
|
||||
SYMCRYPT_MOD_FUNCTIONS_FDEF369_MONTGOMERY,
|
||||
{NULL,},
|
||||
@@ -68,7 +68,7 @@ const SYMCRYPT_MODULUS_TYPE_SELECTION_ENTRY SymCryptModulusTypeSelections[] =
{('xM' << 16) + SymCryptModFntableMontgomeryMulx, SYMCRYPT_CPU_FEATURES_FOR_MULX, 0, SYMCRYPT_MODULUS_FEATURE_MONTGOMERY },
|
||||
{('1M' << 16) + SymCryptModFntableMontgomery1024, 0, 1024, SYMCRYPT_MODULUS_FEATURE_MONTGOMERY },
|
||||
|
||||
#elif SYMCRYPT_CPU_ARM64 && SYMCRYPT_MS_VC
|
||||
#elif SYMCRYPT_CPU_ARM64
|
||||
|
||||
{('mM' << 16) + SymCryptModFntableMontgomery, 0, 256, SYMCRYPT_MODULUS_FEATURE_MONTGOMERY },
|
||||
{('9M' << 16) + SymCryptModFntable369Montgomery, 0, 384, SYMCRYPT_MODULUS_FEATURE_MONTGOMERY },
|
||||
@@ -1,472 +0,0 @@
;
|
||||
; fdef369_asm.asm   Assembler code for large integer arithmetic in the default data format
|
||||
;
|
||||
; This file contains alternative routines that pretend that each digit is only 3 words.
|
||||
; This gets used if the number is 1, 2, 3, 5, 6, or 9 digits long.
|
||||
; The immediate advantage is that it improves EC performance on 192, 384, and 521-bit curves.
|
||||
;
|
||||
; Most of this code is a direct copy of the default code.
|
||||
;
|
||||
; Copyright (c) Microsoft Corporation. Licensed under the MIT license.
|
||||
;
|
||||
|
||||
#include "ksarm64.h"
|
||||
|
||||
; As Arm assembler already uses C preprocessor, we can just hardcode this asm to include constants
|
||||
; MASM for now. To be fixed properly when converting arm64 asm to symcryptasm.
|
||||
#define SYMCRYPT_MASM
|
||||
#include "C_asm_shared.inc"
|
||||
#undef SYMCRYPT_MASM
|
||||
|
||||
#include "symcrypt_version.inc"
|
||||
#include "symcrypt_name_mangling.inc"
|
||||
#include "symcrypt_magic.inc"
|
||||
|
||||
; A digit consists of 3 words of 64 bits each
|
||||
|
||||
;UINT32
|
||||
;SYMCRYPT_CALL
|
||||
; SymCryptFdef369RawAdd(
|
||||
; _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc1,
|
||||
; _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc2,
|
||||
; _Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 pDst,
|
||||
; UINT32 nDigits );
|
||||
;
|
||||
; Initial inputs to registers:
|
||||
; pSrc1 -> x0
|
||||
; pSrc2 -> x1
|
||||
; pDst -> x2
|
||||
; nDigits -> x3
|
||||
|
||||
LEAF_ENTRY ARM64EC_NAME_MANGLE(SymCryptFdef369RawAddAsm)
|
||||
|
||||
neg x3, x3 ; negate the digit count
|
||||
ands x4, x4, x4 ; Zero the carry flag
|
||||
|
||||
SymCryptFdef369RawAddAsmLoop
|
||||
add x3, x3, #1 ; Increment the digit count by one
|
||||
; carry is in the carry flag
|
||||
|
||||
ldp x4, x6, [x0], #16 ; Load two words of pSrc1
|
||||
ldp x5, x7, [x1], #16 ; Load two words of pSrc2
|
||||
adcs x4, x4, x5
|
||||
adcs x6, x6, x7
|
||||
stp x4, x6, [x2], #16 ; Store the result in the destination
|
||||
|
||||
ldr x4, [x0], #8
|
||||
ldr x5, [x1], #8
|
||||
adcs x4, x4, x5
|
||||
str x4, [x2], #8
|
||||
|
||||
cbnz x3, SymCryptFdef369RawAddAsmLoop
|
||||
|
||||
csetcs x0 ; Set the return value equal to the carry
|
||||
|
||||
ret
|
||||
|
||||
LEAF_END ARM64EC_NAME_MANGLE(SymCryptFdef369RawAddAsm)
|
||||
|
||||
;UINT32
|
||||
;SYMCRYPT_CALL
|
||||
;SymCryptFdef369RawSub(
|
||||
; _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 Src1,
|
||||
; _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 Src2,
|
||||
; _Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 Dst,
|
||||
; UINT32 nDigits )
|
||||
;
|
||||
; Initial inputs to registers:
|
||||
; pSrc1 -> x0
|
||||
; pSrc2 -> x1
|
||||
; pDst -> x2
|
||||
; nDigits -> x3
|
||||
|
||||
LEAF_ENTRY ARM64EC_NAME_MANGLE(SymCryptFdef369RawSubAsm)
|
||||
|
||||
neg x3, x3 ; negate the digit count
|
||||
subs x4, x4, x4 ; Set the carry flag (i.e. no borrow)
|
||||
|
||||
SymCryptFdef369RawSubAsmLoop
|
||||
add x3, x3, #1 ; Increment the digit count by one
|
||||
; borrow is in the carry flag (flipped)
|
||||
|
||||
ldp x4, x6, [x0], #16 ; Load two words of pSrc1
|
||||
ldp x5, x7, [x1], #16 ; Load two words of pSrc2
|
||||
sbcs x4, x4, x5
|
||||
sbcs x6, x6, x7
|
||||
stp x4, x6, [x2], #16 ; Store the result in the destination
|
||||
|
||||
ldr x4, [x0], #8
|
||||
ldr x5, [x1], #8
|
||||
sbcs x4, x4, x5
|
||||
str x4, [x2], #8
|
||||
|
||||
cbnz x3, SymCryptFdef369RawSubAsmLoop
|
||||
|
||||
csetcc x0 ; If the carry is clear (borrow), set the return value to 1
|
||||
|
||||
ret
|
||||
|
||||
LEAF_END ARM64EC_NAME_MANGLE(SymCryptFdef369RawSubAsm)
|
||||
|
||||
;VOID
|
||||
;SYMCRYPT_CALL
|
||||
;SymCryptFdef369MaskedCopy(
|
||||
; _In_reads_bytes_( nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCBYTE pbSrc,
|
||||
; _InOut_writes_bytes_( nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PBYTE pbDst,
|
||||
; UINT32 nDigits,
|
||||
; UINT32 mask )
|
||||
|
||||
LEAF_ENTRY ARM64EC_NAME_MANGLE(SymCryptFdef369MaskedCopyAsm)
|
||||
|
||||
neg x2, x2 ; negate the digit count
|
||||
subs x4, XZR, x3 ; If (x3 > 0) clear the carry flag (i.e. borrow)
|
||||
|
||||
SymCryptFdef369MaskedCopyAsmLoop
|
||||
add x2, x2, #1 ; Increment the digit count by one
|
||||
|
||||
ldp x4, x6, [x0], #16 ; Load two words of the source
|
||||
ldp x5, x7, [x1] ; Load two words of the destination
|
||||
cselcc x4, x4, x5 ; If the carry is clear, select the source operands
|
||||
cselcc x6, x6, x7
|
||||
stp x4, x6, [x1], #16 ; Store the two words in the destination
|
||||
|
||||
ldr x4, [x0], #8
|
||||
ldr x5, [x1]
|
||||
cselcc x4, x4, x5
|
||||
str x4, [x1], #8
|
||||
|
||||
cbnz x2, SymCryptFdef369MaskedCopyAsmLoop
|
||||
|
||||
; Done, no return value
|
||||
|
||||
ret
|
||||
|
||||
LEAF_END ARM64EC_NAME_MANGLE(SymCryptFdef369MaskedCopyAsm)
|
||||
|
||||
;VOID
|
||||
;SYMCRYPT_CALL
|
||||
;SymCryptFdef369RawMul(
|
||||
; _In_reads_(nWords1) PCUINT32 pSrc1,
|
||||
; UINT32 nDigits1,
|
||||
; _In_reads_(nWords2) PCUINT32 pSrc2,
|
||||
; UINT32 nDigits2,
|
||||
; _Out_writes_(nWords1 + nWords2) PUINT32 pDst )
|
||||
;
|
||||
; Initial inputs to registers:
|
||||
; pSrc1 -> x0
|
||||
; nDigits1 -> x1
|
||||
; pSrc2 -> x2
|
||||
; nDigits2 -> x3
|
||||
; pDst -> x4
|
||||
;
|
||||
; Basic structure:
|
||||
; for each word in Src1:
|
||||
; Dst += Src2 * word
|
||||
;
|
||||
; Register assignments
|
||||
; x0 = pSrc1 (moving forward one word every outer loop)
|
||||
; x1 = negated word count of pSrc1
|
||||
; x2 = pSrc2 (moving forward one *digit* every inner loop)
|
||||
; x3 = negated digit count of pSrc2 and pDst
|
||||
; x4 = pDst (moving forward one *digit* every inner loop)
|
||||
; x5 = Stored pDst (moving forward one word every outer loop)
|
||||
; x6 = Current word loaded from pSrc1
|
||||
; x8, x9 = Current words loaded in pairs from pSrc2
|
||||
; x10, x11 = Current words loaded in pairs from pDst
|
||||
; x12, x15 = "128-bit" sliding register to hold the result of multiplies
|
||||
; x16 = Stored pSrc2
|
||||
; x17 = Stored negated digit count of pSrc2
|
||||
; Note x13, x14 are reserved in ARM64EC and thus are not used
|
||||
|
||||
|
||||
LEAF_ENTRY ARM64EC_NAME_MANGLE(SymCryptFdef369RawMulAsm)
|
||||
|
||||
add x1, x1, x1, LSL #1 ; Calculate word count (x1 * 3)
|
||||
|
||||
neg x1, x1 ; negate nWords1
|
||||
neg x3, x3 ; negate nDigits2
|
||||
|
||||
mov x5, x4 ; store pDst
|
||||
mov x16, x2 ; store pSrc2
|
||||
mov x17, x3 ; store -nDigits2 for later
|
||||
|
||||
;
|
||||
; First iteration of main loop (no adding of previous values from pDst)
|
||||
;
|
||||
ands x15, x15, XZR ; Clearing the carry flag and setting x15 = 0
|
||||
ldr x6, [x0] ; load the first word from pSrc1
|
||||
|
||||
SymCryptFdef369RawMulAsmLoopInner1
|
||||
add x3, x3, #1 ; move one digit up
|
||||
|
||||
ldp x8, x9, [x2], #16 ; load 2 words from pSrc2
|
||||
|
||||
mul x12, x6, x8 ; Bits <63:0> of pSrc1[0]*pSrc2[j]
|
||||
adcs x12, x12, x15 ; Adding the previous word (if there was a carry from the last addition it is added)
|
||||
umulh x15, x6, x8 ; Bits <127:64> of pSrc1[0]*pSrc2[j]
|
||||
str x12, [x4], #8 ; Store to destination
|
||||
|
||||
mul x12, x6, x9 ; Bits <63:0> of pSrc1[0]*pSrc2[j+1]
|
||||
adcs x12, x12, x15 ; Adding the previous word (if there was a carry from the last addition it is added)
|
||||
umulh x15, x6, x9 ; Bits <127:64> of pSrc1[0]*pSrc2[j+1]
|
||||
str x12, [x4], #8 ; Store to destination
|
||||
|
||||
ldr x8, [x2], #8
|
||||
|
||||
mul x12, x6, x8 ; Bits <63:0> of pSrc1[0]*pSrc2[j+2]
|
||||
adcs x12, x12, x15 ; Adding the previous word (if there was a carry from the last addition it is added)
|
||||
umulh x15, x6, x8 ; Bits <127:64> of pSrc1[0]*pSrc2[j+2]
|
||||
str x12, [x4], #8 ; Store to destination
|
||||
|
||||
cbnz x3, SymCryptFdef369RawMulAsmLoopInner1
|
||||
|
||||
adc x15, x15, XZR ; Store the next word into the destination (with the carry if any)
|
||||
str x15, [x4]
|
||||
|
||||
add x1, x1, #1 ; move one word up
|
||||
add x0, x0, #8 ; move start of pSrc1 one word up
|
||||
add x5, x5, #8 ; move start of pDst one word up
|
||||
|
||||
;
|
||||
; MAIN LOOP
|
||||
;
|
||||
SymCryptFdef369RawMulAsmLoopOuter
|
||||
mov x3, x17 ; set -nDigits2
|
||||
mov x2, x16 ; set pSrc2
|
||||
mov x4, x5 ; set pDst
|
||||
|
||||
ands x15, x15, XZR ; Clearing the carry flag and setting x15 = 0
|
||||
ldr x6, [x0] ; load the next word from pSrc1
|
||||
|
||||
SymCryptFdef369RawMulAsmLoopInner
|
||||
add x3, x3, #1 ; move one digit up
|
||||
|
||||
ldp x8, x9, [x2], #16 ; load 2 words from pSrc2
|
||||
ldp x10, x11, [x4] ; load 2 words from pDst
|
||||
|
||||
mul x12, x6, x8 ; Bits <63:0> of pSrc1[i]*pSrc2[j]
|
||||
adcs x12, x12, x15 ; Adding the previous word (if there was a carry from the last addition it is added)
|
||||
umulh x15, x6, x8 ; Bits <127:64> of pSrc1[i]*pSrc2[j]
|
||||
adc x15, x15, XZR ; Add the carry if any and don't update the flags
|
||||
; Note: this cannot overflow as the maximum for <x15:x12> is (2^64-1)(2^64-1)+(2^64-1)+1 = 2^128 - 2^64 + 1
|
||||
adds x12, x12, x10 ; add the word from the destination and update the flags (this can overflow)
|
||||
str x12, [x4], #8 ; Store to destination
|
||||
|
||||
mul x12, x6, x9 ; Bits <63:0> of pSrc1[i]*pSrc2[j+1]
|
||||
adcs x12, x12, x15 ; Adding the previous word (if there was a carry from the last addition it is added)
|
||||
umulh x15, x6, x9 ; Bits <127:64> of pSrc1[i]*pSrc2[j+1]
|
||||
adc x15, x15, XZR ; Add the carry if any and don't update the flags
|
||||
adds x12, x12, x11 ; add the word from the destination and update the flags (this can overflow)
|
||||
str x12, [x4], #8 ; Store to destination
|
||||
|
||||
ldr x8, [x2], #8
|
||||
ldr x10, [x4]
|
||||
|
||||
mul x12, x6, x8 ; Bits <63:0> of pSrc1[i]*pSrc2[j+2]
|
||||
adcs x12, x12, x15 ; Adding the previous word (if there was a carry from the last addition it is added)
|
||||
umulh x15, x6, x8 ; Bits <127:64> of pSrc1[i]*pSrc2[j+2]
|
||||
adc x15, x15, XZR ; Add the carry if any and don't update the flags
|
||||
adds x12, x12, x10 ; add the word from the destination and update the flags (this can overflow)
|
||||
str x12, [x4], #8 ; Store to destination
|
||||
|
||||
cbnz x3, SymCryptFdef369RawMulAsmLoopInner
|
||||
|
||||
adc x15, x15, XZR ; Store the next word into the destination (with the carry if any)
|
||||
str x15, [x4]
|
||||
|
||||
adds x1, x1, #1 ; move one word up
|
||||
add x0, x0, #8 ; move start of pSrc1 one word up
|
||||
add x5, x5, #8 ; move start of pDst one word up
|
||||
|
||||
bne SymCryptFdef369RawMulAsmLoopOuter
|
||||
|
||||
; Done, no return value
|
||||
|
||||
ret
|
||||
|
||||
LEAF_END ARM64EC_NAME_MANGLE(SymCryptFdef369RawMulAsm)
|
||||
|
||||
;VOID
|
||||
;SymCryptFdef369MontgomeryReduceAsm(
|
||||
; _In_ PCSYMCRYPT_MODULUS pmMod,
|
||||
; _In_ PUINT32 pSrc,
|
||||
; _Out_ PUINT32 pDst )
|
||||
;
|
||||
; Initial inputs to registers:
|
||||
; pmMod -> x0
|
||||
; pSrc -> x1
|
||||
; pDst -> x2
|
||||
;
|
||||
; Register assignments
|
||||
; x0 = pMod (moving forward one *digit* every inner loop)
|
||||
; x1 = pSrc (moving forward one *digit* every inner loop)
|
||||
; x2 = pDst (used only in the end for subtract / result)
|
||||
; x3 = negated digit count of pSrc and pMod
|
||||
; x4 = negated word count of pSrc
|
||||
; x5 = Inv64 of the modulus
|
||||
; x6 = m = pSrc[i]*Inv64
|
||||
; x7 = hc = high carry variable
|
||||
; x8, x9 = Current words loaded in pairs from pSrc
|
||||
; x10, x11 = Current words loaded in pairs from pMod
|
||||
; x12, x15 = c variable = "128-bit" sliding register to hold the result of multiplies
|
||||
; x16 = Temporary intermediate result
|
||||
; x17 = Stored negated digit count of pSrc
|
||||
; x19 = Stored pMod pointer
|
||||
; x20 = Stored pSrc pointer (moving forward one word every outer loop)
|
||||
; Note x13, x14 are reserved in ARM64EC and thus are not used
|
||||
|
||||
NESTED_ENTRY ARM64EC_NAME_MANGLE(SymCryptFdef369MontgomeryReduceAsm)
|
||||
PROLOG_SAVE_REG_PAIR fp, lr, #-32! ; allocate 32 bytes of stack; store FP/LR
|
||||
PROLOG_SAVE_REG_PAIR x19, x20, #16 ; free up x19/x20
|
||||
|
||||
ldr w3, [x0, #SymCryptModulusNdigitsOffsetArm64] ; # of Digits
|
||||
ldr x5, [x0, #SymCryptModulusMontgomeryInv64OffsetArm64] ; Inv64 of modulus
|
||||
add x0, x0, #SymCryptModulusValueOffsetArm64 ; pMod
|
||||
|
||||
add x4, x3, x3, LSL #1 ; Calculate word count (x3 * 3)
|
||||
|
||||
neg x3, x3 ; Negate the digit count
|
||||
neg x4, x4 ; Negate the word count
|
||||
|
||||
mov x17, x3 ; Store the digit count for later
|
||||
mov x19, x0 ; Store the pMod pointer
|
||||
mov x20, x1 ; Store the pSrc pointer
|
||||
|
||||
ands x7, x7, XZR ; Set hc to 0
|
||||
|
||||
;
|
||||
; Main loop
|
||||
;
|
||||
SymCryptFdef369MontgomeryReduceAsmOuter
|
||||
ldr x8, [x1] ; Load 1 word from pSrc
|
||||
mul x6, x8, x5 ; <63:0> bits of pSrc[i]*Inv64 = m
|
||||
|
||||
ands x12, x12, XZR ; Set c to 0
|
||||
ands x15, x15, XZR ; Set c to 0
|
||||
|
||||
SymCryptFdef369MontgomeryReduceAsmInner
|
||||
ldp x10, x11, [x0], #16 ; pMod[j]
|
||||
ldp x8, x9, [x1] ; pSrc[j]
|
||||
|
||||
mul x16, x6, x10 ; <63:0> of pMod[j]*m
|
||||
adds x16, x16, x8 ; Adding pSrc[j]
|
||||
umulh x15, x6, x10 ; <127:64> of pMod[j]*m
|
||||
adc x15, x15, XZR ; Add the carry if any (***)
|
||||
adds x12, x12, x16 ; Add the lower bits of c
|
||||
adc x15, x15, XZR ; Add the carry if any (***)
|
||||
; ***: These cannot produce extra carry as the maximum is
|
||||
; (2^64 - 1)*(2^64-1) + 2^64-1 + 2^64-1 = 2^128 - 1
|
||||
str x12, [x1], #8 ; pSrc[j] = (UINT64) c
|
||||
mov x12, x15 ; c >>= 64
|
||||
|
||||
mul x16, x6, x11 ; <63:0> of pMod[j]*m
|
||||
adds x16, x16, x9 ; Adding pSrc[j]
|
||||
umulh x15, x6, x11 ; <127:64> of pMod[j]*m
|
||||
adc x15, x15, XZR ; Add the carry if any (***)
|
||||
adds x12, x12, x16 ; Add the lower bits of c
|
||||
adc x15, x15, XZR ; Add the carry if any (***)
|
||||
str x12, [x1], #8 ; pSrc[j] = (UINT64) c
|
||||
mov x12, x15 ; c >>= 64
|
||||
|
||||
ldr x10, [x0], #8 ; pMod[j]
|
||||
ldr x8, [x1] ; pSrc[j]
|
||||
|
||||
mul x16, x6, x10 ; <63:0> of pMod[j]*m
|
||||
adds x16, x16, x8 ; Adding pSrc[j]
|
||||
umulh x15, x6, x10 ; <127:64> of pMod[j]*m
|
||||
adc x15, x15, XZR ; Add the carry if any (***)
|
||||
adds x12, x12, x16 ; Add the lower bits of c
|
||||
adc x15, x15, XZR ; Add the carry if any (***)
|
||||
str x12, [x1], #8 ; pSrc[j] = (UINT64) c
|
||||
mov x12, x15 ; c >>= 64
|
||||
|
||||
adds x3, x3, #1 ; Move one digit up
|
||||
bne SymCryptFdef369MontgomeryReduceAsmInner
|
||||
|
||||
ldr x8, [x1] ; pSrc[nWords]
|
||||
adds x12, x12, x8 ; c + pSrc[nWords]
|
||||
adc x15, XZR, XZR ; Add the carry if any
|
||||
|
||||
adds x12, x12, x7 ; c + pSrc[nWords] + hc
|
||||
adc x7, x15, XZR ; Add the carry if any and store into hc
|
||||
|
||||
str x12, [x1] ; pSrc[nWords] = c
|
||||
|
||||
adds x4, x4, #1 ; Move one word up
|
||||
|
||||
add x20, x20, #8 ; Move stored pSrc pointer one word up
|
||||
mov x0, x19 ; Restore pMod pointer
|
||||
mov x1, x20 ; Restore pSrc pointer
|
||||
|
||||
mov x3, x17 ; Restore the digit counter
|
||||
|
||||
bne SymCryptFdef369MontgomeryReduceAsmOuter
|
||||
|
||||
;
|
||||
; Subtraction
|
||||
;
|
||||
|
||||
mov x16, x2 ; Store pDst pointer
|
||||
|
||||
; Prepare the pointers for subtract
|
||||
mov x0, x20 ; pSrc
|
||||
mov x1, x19 ; pMod
|
||||
|
||||
mov x10, x7 ; x10 = hc
|
||||
mov x3, x17 ; Restore the digit counter
|
||||
subs x4, x4, x4 ; Set the carry flag (i.e. no borrow)
|
||||
|
||||
SymCryptFdef369MontgomeryReduceRawSubAsmLoop
|
||||
add x3, x3, #1 ; Increment the digit count by one
|
||||
; borrow is in the carry flag (flipped)
|
||||
|
||||
ldp x4, x6, [x0], #16 ; Load two words of pSrc1
|
||||
ldp x5, x7, [x1], #16 ; Load two words of pSrc2
|
||||
sbcs x4, x4, x5
|
||||
sbcs x6, x6, x7
|
||||
stp x4, x6, [x2], #16 ; Store the result in the destination
|
||||
|
||||
ldr x4, [x0], #8
|
||||
ldr x5, [x1], #8
|
||||
sbcs x4, x4, x5
|
||||
str x4, [x2], #8
|
||||
|
||||
cbnz x3, SymCryptFdef369MontgomeryReduceRawSubAsmLoop
|
||||
|
||||
csetcc x0 ; If the carry is clear (borrow), set the return value to 1
|
||||
|
||||
orr x11, x10, x0 ; x11 = hc|d
|
||||
|
||||
; Prepare the pointers for masked copy
|
||||
mov x0, x20 ; pSrc
|
||||
mov x1, x16 ; pDst
|
||||
|
||||
mov x2, x17 ; Restore the digit counter
|
||||
subs x4, x10, x11 ; If (x11 > x10) clear the carry flag (i.e. borrow)
|
||||
|
||||
SymCryptFdef369MontgomeryReduceMaskedCopyAsmLoop
|
||||
add x2, x2, #1 ; Increment the digit count by one
|
||||
|
||||
ldp x4, x6, [x0], #16 ; Load two words of the source
|
||||
ldp x5, x7, [x1] ; Load two words of the destination
|
||||
cselcc x4, x4, x5 ; If the carry is clear, select the source operands
|
||||
cselcc x6, x6, x7
|
||||
stp x4, x6, [x1], #16 ; Store the two words in the destination
|
||||
|
||||
ldr x4, [x0], #8
|
||||
ldr x5, [x1]
|
||||
cselcc x4, x4, x5
|
||||
str x4, [x1], #8
|
||||
|
||||
cbnz x2, SymCryptFdef369MontgomeryReduceMaskedCopyAsmLoop
|
||||
|
||||
; Done, no return value
|
||||
|
||||
EPILOG_RESTORE_REG_PAIR x19, x20, #16
|
||||
EPILOG_RESTORE_REG_PAIR fp, lr, #32!
|
||||
EPILOG_RETURN
|
||||
|
||||
NESTED_END ARM64EC_NAME_MANGLE(SymCryptFdef369MontgomeryReduceAsm)
|
||||
|
||||
END
|
||||
@@ -0,0 +1,465 @@
//
|
||||
// fdef369_asm.symcryptasm Assembler code for large integer arithmetic in the default data format
|
||||
// Expresses asm in a generic enough way to enable generation of MASM and GAS using the
|
||||
// symcryptasm_processor.py script and C preprocessor
|
||||
//
|
||||
// This file contains alternative routines that pretend that each digit is only 3 words.
|
||||
// This gets used if the number is 1, 2, 3, 5, 6, or 9 digits long.
|
||||
// The immediate advantage is that it improves EC performance on 192, 384, and 521-bit curves.
|
||||
//
|
||||
// Most of this code is a direct copy of the default code.
|
||||
//
|
||||
// Copyright (c) Microsoft Corporation. Licensed under the MIT license.
|
||||
//
|
||||
|
||||
#include "symcryptasm_shared.cppasm"
|
||||
|
||||
// A digit consists of 3 words of 64 bits each
|
||||
|
||||
//UINT32
|
||||
//SYMCRYPT_CALL
|
||||
//SymCryptFdef369RawAddAsm(
|
||||
// _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 Src1,
|
||||
// _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 Src2,
|
||||
// _Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 Dst,
|
||||
// UINT32 nDigits )
|
||||
|
||||
FUNCTION_START(ARM64EC_NAME_MANGLE(SymCryptFdef369RawAddAsm), 4, 8)
|
||||
|
||||
ldp X_4, X_6, [X_0] // Load two words of pSrc1
|
||||
ldp X_5, X_7, [X_1] // Load two words of pSrc2
|
||||
adds X_4, X_4, X_5
|
||||
adcs X_6, X_6, X_7
|
||||
stp X_4, X_6, [X_2] // Store the result in the destination
|
||||
|
||||
ldr X_4, [X_0, #16] // Load one word of pSrc1
|
||||
sub X_3, X_3, #1 // Decrement the digit count by one
|
||||
ldr X_5, [X_1, #16] // Load one word of pSrc2
|
||||
adcs X_4, X_4, X_5
|
||||
str X_4, [X_2, #16] // Store the result in the destination
|
||||
|
||||
cbz X_3, SymCryptFdef369RawAddAsmEnd
|
||||
|
||||
LABEL(SymCryptFdef369RawAddAsmLoop)
|
||||
// carry is in the carry flag
|
||||
// only update pointers to srcs and destination once per loop to reduce uops and dependencies
|
||||
ldp X_4, X_6, [X_0, #24]! // Load two words of pSrc1
|
||||
ldp X_5, X_7, [X_1, #24]! // Load two words of pSrc2
|
||||
adcs X_4, X_4, X_5
|
||||
adcs X_6, X_6, X_7
|
||||
stp X_4, X_6, [X_2, #24]! // Store the result in the destination
|
||||
|
||||
ldr X_4, [X_0, #16] // Load one word of pSrc1
|
||||
sub X_3, X_3, #1 // Decrement the digit count by one
|
||||
ldr X_5, [X_1, #16] // Load one word of pSrc2
|
||||
adcs X_4, X_4, X_5
|
||||
str X_4, [X_2, #16] // Store the result in the destination
|
||||
|
||||
cbnz X_3, SymCryptFdef369RawAddAsmLoop
|
||||
|
||||
ALIGN(4)
|
||||
LABEL(SymCryptFdef369RawAddAsmEnd)
|
||||
cset X_0, cs // Set the return value equal to the carry
|
||||
|
||||
FUNCTION_END(ARM64EC_NAME_MANGLE(SymCryptFdef369RawAddAsm))
|
||||
|
||||
//UINT32
|
||||
//SYMCRYPT_CALL
|
||||
//SymCryptFdef369RawSubAsm(
|
||||
// _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc1,
|
||||
// _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc2,
|
||||
// _Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 pDst,
|
||||
// UINT32 nDigits )
|
||||
|
||||
FUNCTION_START(ARM64EC_NAME_MANGLE(SymCryptFdef369RawSubAsm), 4, 8)
|
||||
|
||||
ldp X_4, X_6, [X_0] // Load two words of pSrc1
|
||||
ldp X_5, X_7, [X_1] // Load two words of pSrc2
|
||||
subs X_4, X_4, X_5
|
||||
sbcs X_6, X_6, X_7
|
||||
stp X_4, X_6, [X_2] // Store the result in the destination
|
||||
|
||||
ldr X_4, [X_0, #16] // Load one word of pSrc1
|
||||
sub X_3, X_3, #1 // Decrement the digit count by one
|
||||
ldr X_5, [X_1, #16] // Load one word of pSrc2
|
||||
sbcs X_4, X_4, X_5
|
||||
str X_4, [X_2, #16] // Store the result in the destination
|
||||
|
||||
cbz X_3, SymCryptFdef369RawSubAsmEnd
|
||||
|
||||
LABEL(SymCryptFdef369RawSubAsmLoop)
|
||||
// borrow is in the carry flag (flipped)
|
||||
// only update pointers to srcs and destination once per loop to reduce uops and dependencies
|
||||
ldp X_4, X_6, [X_0, #24]! // Load two words of pSrc1
|
||||
ldp X_5, X_7, [X_1, #24]! // Load two words of pSrc2
|
||||
sbcs X_4, X_4, X_5
|
||||
sbcs X_6, X_6, X_7
|
||||
stp X_4, X_6, [X_2, #24]! // Store the result in the destination
|
||||
|
||||
ldr X_4, [X_0, #16] // Load one word of pSrc1
|
||||
sub X_3, X_3, #1 // Decrement the digit count by one
|
||||
ldr X_5, [X_1, #16] // Load one word of pSrc2
|
||||
sbcs X_4, X_4, X_5
|
||||
str X_4, [X_2, #16] // Store the result in the destination
|
||||
|
||||
cbnz X_3, SymCryptFdef369RawSubAsmLoop
|
||||
|
||||
ALIGN(4)
|
||||
LABEL(SymCryptFdef369RawSubAsmEnd)
|
||||
cset X_0, cc // If the carry is clear (borrow), set the return value to 1
|
||||
|
||||
FUNCTION_END(ARM64EC_NAME_MANGLE(SymCryptFdef369RawSubAsm))
|
||||
|
||||
//VOID
|
||||
//SYMCRYPT_CALL
|
||||
//SymCryptFdef369MaskedCopyAsm(
|
||||
// _In_reads_bytes_( nDigits*SYMCRYPT_FDEF_DIGIT_SIZE ) PCBYTE pbSrc,
|
||||
// _Inout_updates_bytes_( nDigits*SYMCRYPT_FDEF_DIGIT_SIZE ) PBYTE pbDst,
|
||||
// UINT32 nDigits,
|
||||
// UINT32 mask )
|
||||
|
||||
FUNCTION_START(ARM64EC_NAME_MANGLE(SymCryptFdef369MaskedCopyAsm), 4, 7)
|
||||
|
||||
subs xzr, xzr, X_3 // If (X_3 > 0) clear the carry flag (i.e. borrow)
|
||||
|
||||
ldp X_3, X_5, [X_0] // Load two words of the source
|
||||
ldp X_4, X_6, [X_1] // Load two words of the destination
|
||||
csel X_3, X_3, X_4, cc // If the carry is clear, select the source operand
|
||||
csel X_5, X_5, X_6, cc
|
||||
stp X_3, X_5, [X_1] // Store the two words in the destination
|
||||
|
||||
ldr X_3, [X_0, #16] // Load one word of the source
|
||||
sub X_2, X_2, #1 // Decrement the digit count by one
|
||||
ldr X_4, [X_1, #16] // Load one word of the destination
|
||||
csel X_3, X_3, X_4, cc
|
||||
str X_3, [X_1, #16] // Store the one word in the destination
|
||||
|
||||
cbz X_2, SymCryptFdef369MaskedCopyAsmEnd
|
||||
|
||||
LABEL(SymCryptFdef369MaskedCopyAsmLoop)
|
||||
ldp X_3, X_5, [X_0, #24]! // Load two words of the source
|
||||
ldp X_4, X_6, [X_1, #24]! // Load two words of the destination
|
||||
csel X_3, X_3, X_4, cc // If the carry is clear, select the source operand
|
||||
csel X_5, X_5, X_6, cc
|
||||
stp X_3, X_5, [X_1] // Store the two words in the destination
|
||||
|
||||
ldr X_3, [X_0, #16] // Load one word of the source
|
||||
sub X_2, X_2, #1 // Decrement the digit count by one
|
||||
ldr X_4, [X_1, #16] // Load one word of the destination
|
||||
csel X_3, X_3, X_4, cc
|
||||
str X_3, [X_1, #16] // Store the one word in the destination
|
||||
|
||||
cbnz X_2, SymCryptFdef369MaskedCopyAsmLoop
|
||||
|
||||
LABEL(SymCryptFdef369MaskedCopyAsmEnd)
|
||||
// Done, no return value
|
||||
|
||||
FUNCTION_END(ARM64EC_NAME_MANGLE(SymCryptFdef369MaskedCopyAsm))
|
||||
|
||||
//VOID
|
||||
//SYMCRYPT_CALL
|
||||
//SymCryptFdef369RawMulAsm(
|
||||
// _In_reads_(nDigits1*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc1,
|
||||
// UINT32 nDigits1,
|
||||
// _In_reads_(nDigits2*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc2,
|
||||
// UINT32 nDigits2,
|
||||
// _Out_writes_((nDigits1+nDigits2)*SYMCRYPT_FDEF_DIGIT_NUINT32) PUINT32 pDst )
|
||||
//
|
||||
// Basic structure:
|
||||
// for each word in Src1:
|
||||
// Dst += Src2 * word
|
||||
//
|
||||
// Register assignments
|
||||
// X_0 = pSrc1 (moving forward one word every outer loop)
|
||||
// X_1 = word count of pSrc1
|
||||
// X_2 = pSrc2 (moving forward one *digit* every inner loop)
|
||||
// X_3 = digit count of pSrc2 and pDst
|
||||
// X_4 = pDst (moving forward one *digit* every inner loop)
|
||||
// X_5 = Stored pDst (moving forward one word every outer loop)
|
||||
// X_6 = Current word loaded from pSrc1
|
||||
// X_7, X_8 = Current words loaded in pairs from pSrc2
|
||||
// X_9, X_10 = Current words loaded in pairs from pDst
|
||||
// X_11, X_12 = Scratch registers for holding the results of multiplies
|
||||
// X_13 = Stored pSrc2
|
||||
// X_14 = Stored digit count of pSrc2
|
||||
// X_15 = Scratch register for holding the results of multiplies
|
||||
|
||||
FUNCTION_START(ARM64EC_NAME_MANGLE(SymCryptFdef369RawMulAsm), 5, 16)
|
||||
|
||||
add X_1, X_1, X_1, LSL #1 // Calculate word count (X_1 * 3)
|
||||
|
||||
sub X_2, X_2, #24 // offset pSrc2 so we can use pre-increment form of loads
|
||||
sub X_4, X_4, #24 // offset pDst so we can use pre-increment form of loads
|
||||
|
||||
mov X_5, X_4 // store pDst
|
||||
mov X_13, X_2 // store pSrc2
|
||||
mov X_14, X_3 // store nDigits2 for later
|
||||
|
||||
//
|
||||
// First iteration of main loop (no adding of previous values from pDst)
|
||||
//
|
||||
ands X_12, X_12, xzr // Clearing the carry flag and setting X_12 = 0
|
||||
ldr X_6, [X_0] // load the first word from pSrc1
|
||||
|
||||
LABEL(SymCryptFdef369RawMulAsmLoopInner1)
|
||||
sub X_3, X_3, #1 // move one digit up
|
||||
|
||||
ldp X_7, X_8, [X_2, #24]! // load 2 words from pSrc2
|
||||
|
||||
mul X_11, X_6, X_7 // Bits <63:0> of pSrc1[0]*pSrc2[j]
|
||||
adcs X_11, X_11, X_12 // Adding the previous word (if there was a carry from the last addition it is added)
|
||||
umulh X_12, X_6, X_7 // Bits <127:64> of pSrc1[0]*pSrc2[j]
|
||||
|
||||
mul X_15, X_6, X_8 // Bits <63:0> of pSrc1[0]*pSrc2[j+1]
|
||||
adcs X_15, X_15, X_12 // Adding the previous word (if there was a carry from the last addition it is added)
|
||||
umulh X_12, X_6, X_8 // Bits <127:64> of pSrc1[0]*pSrc2[j+1]
|
||||
|
||||
stp X_11, X_15, [X_4, #24]! // Store to destination
|
||||
ldr X_7, [X_2, #16] // load 1 word from pSrc2
|
||||
|
||||
mul X_11, X_6, X_7 // Bits <63:0> of pSrc1[0]*pSrc2[j+2]
|
||||
adcs X_11, X_11, X_12 // Adding the previous word (if there was a carry from the last addition it is added)
|
||||
umulh X_12, X_6, X_7 // Bits <127:64> of pSrc1[0]*pSrc2[j+2]
|
||||
|
||||
str X_11, [X_4, #16] // Store to destination
|
||||
|
||||
cbnz X_3, SymCryptFdef369RawMulAsmLoopInner1
|
||||
|
||||
adc X_12, X_12, xzr // Store the next word into the destination (with the carry if any)
|
||||
str X_12, [X_4, #24]
|
||||
|
||||
sub X_1, X_1, #1 // move one word up
|
||||
add X_0, X_0, #8 // move start of pSrc1 one word up
|
||||
add X_5, X_5, #8 // move start of pDst one word up
|
||||
|
||||
//
|
||||
// MAIN LOOP
|
||||
//
|
||||
LABEL(SymCryptFdef369RawMulAsmLoopOuter)
|
||||
mov X_3, X_14 // set nDigits2
|
||||
mov X_2, X_13 // set pSrc2
|
||||
mov X_4, X_5 // set pDst
|
||||
|
||||
ands X_12, X_12, xzr // Clearing the carry flag and setting X_12 = 0
|
||||
ldr X_6, [X_0] // load the next word from pSrc1
|
||||
|
||||
LABEL(SymCryptFdef369RawMulAsmLoopInner)
|
||||
sub X_3, X_3, #1 // move one digit up
|
||||
|
||||
ldp X_7, X_8, [X_2, #24]! // load 2 words from pSrc2
|
||||
ldp X_9, X_10, [X_4, #24]! // load 2 words from pDst
|
||||
|
||||
adcs X_9, X_9, X_12 // Adding the previous word (if there was a carry from the last addition it is added)
|
||||
umulh X_11, X_6, X_7 // Bits <127:64> of pSrc1[i]*pSrc2[j]
|
||||
adcs X_10, X_11, X_10 // Adding the previous word (if there was a carry from the last addition it is added)
|
||||
umulh X_12, X_6, X_8 // Bits <127:64> of pSrc1[i]*pSrc2[j+1]
|
||||
adc X_12, X_12, xzr // Add the carry if any and don't update the flags
|
||||
|
||||
mul X_11, X_6, X_7 // Bits <63:0> of pSrc1[i]*pSrc2[j]
|
||||
adds X_9, X_9, X_11 // add the word from the destination and update the flags (this can overflow)
|
||||
mul X_11, X_6, X_8 // Bits <63:0> of pSrc1[i]*pSrc2[j+1]
|
||||
adcs X_10, X_10, X_11 // add the word from the destination and update the flags (this can overflow)
|
||||
|
||||
stp X_9, X_10, [X_4] // Store to destination
|
||||
|
||||
ldr X_7, [X_2, #16] // load 1 word from pSrc2
|
||||
ldr X_9, [X_4, #16] // load 1 word from pDst
|
||||
|
||||
adcs X_9, X_9, X_12 // Adding the previous word (if there was a carry from the last addition it is added)
|
||||
umulh X_12, X_6, X_7 // Bits <127:64> of pSrc1[i]*pSrc2[j+2]
|
||||
adc X_12, X_12, xzr // Add the carry if any and don't update the flags
|
||||
|
||||
mul X_11, X_6, X_7 // Bits <63:0> of pSrc1[i]*pSrc2[j+2]
|
||||
adds X_9, X_9, X_11 // add the word from the destination and update the flags (this can overflow)
|
||||
|
||||
str X_9, [X_4, #16] // Store to destination
|
||||
|
||||
cbnz X_3, SymCryptFdef369RawMulAsmLoopInner
|
||||
|
||||
adc X_12, X_12, xzr // Store the next word into the destination (with the carry if any)
|
||||
str X_12, [X_4, #24]
|
||||
|
||||
subs X_1, X_1, #1 // move one word up
|
||||
add X_0, X_0, #8 // move start of pSrc1 one word up
|
||||
add X_5, X_5, #8 // move start of pDst one word up
|
||||
|
||||
bne SymCryptFdef369RawMulAsmLoopOuter
|
||||
|
||||
// Done, no return value
|
||||
|
||||
FUNCTION_END(ARM64EC_NAME_MANGLE(SymCryptFdef369RawMulAsm))
|
||||
|
||||
|
||||
//VOID
|
||||
//SYMCRYPT_CALL
|
||||
//SymCryptFdef369MontgomeryReduceAsm(
|
||||
// _In_ PCSYMCRYPT_MODULUS pmMod,
|
||||
// _Inout_ PUINT32 pSrc,
|
||||
// _Out_ PUINT32 pDst )
|
||||
//
|
||||
// Register assignments
|
||||
// X_0 = pMod (moving forward one *digit* every inner loop)
|
||||
// X_1 = pSrc (moving forward one *digit* every inner loop)
|
||||
// X_2 = pDst (used only in the end for subtract / result)
|
||||
// X_3 = digit count of pSrc and pMod
|
||||
// X_4 = word count of pSrc
|
||||
// X_5 = Inv64 of the modulus
|
||||
// X_6 = m = pSrc[i]*Inv64
|
||||
// X_7 = hc = high carry variable
|
||||
// X_8, X_9 = Current words loaded in pairs from pSrc
|
||||
// X_10, X_11 = Current words loaded in pairs from pMod
|
||||
// X_12, X_13 = c variable = "128-bit" register to hold the result of multiplies
|
||||
// It is flipped between [X_12:X_13] and [X_13:X_12] instead of doing c>>=64
|
||||
// X_14 = Temporary intermediate result
|
||||
// X_15 = Stored digit count of pSrc
|
||||
// X_16 = Stored pMod pointer
|
||||
// X_17 = Stored pSrc pointer (moving forward one word every outer loop)
|
||||
|
||||
FUNCTION_START(ARM64EC_NAME_MANGLE(SymCryptFdef369MontgomeryReduceAsm), 3, 18)
|
||||
|
||||
ldr W_3, [X_0, #SymCryptModulusNdigitsOffsetArm64] // # of Digits
|
||||
ldr X_5, [X_0, #SymCryptModulusMontgomeryInv64OffsetArm64] // Inv64 of modulus
|
||||
add X_0, X_0, #SymCryptModulusValueOffsetArm64 // pMod
|
||||
|
||||
add X_4, X_3, X_3, LSL #1 // Calculate word count (X_3 * 3)
|
||||
|
||||
sub X_0, X_0, #24 // offset pMod so we can use pre-increment form of loads
|
||||
sub X_1, X_1, #24 // offset pSrc so we can use pre-increment form of loads
|
||||
sub X_2, X_2, #24 // offset pDst so we can use pre-increment form of loads
|
||||
|
||||
mov X_15, X_3 // Store the digit count for later
|
||||
mov X_16, X_0 // Store the pMod pointer
|
||||
mov X_17, X_1 // Store the pSrc pointer
|
||||
|
||||
and X_7, X_7, xzr // Set hc to 0
|
||||
|
||||
//
|
||||
// Main loop
|
||||
//
|
||||
LABEL(SymCryptFdef369MontgomeryReduceAsmOuter)
|
||||
ldr X_8, [X_1, #24] // Load 1 word from pSrc
|
||||
mul X_6, X_8, X_5 // <63:0> bits of pSrc[i]*Inv64 = m
|
||||
|
||||
and X_12, X_12, xzr // Set c to 0
|
||||
|
||||
LABEL(SymCryptFdef369MontgomeryReduceAsmInner)
|
||||
ldp X_10, X_11, [X_0, #24]! // pMod[j]
|
||||
ldp X_8, X_9, [X_1, #24]! // pSrc[j]
|
||||
|
||||
mul X_14, X_6, X_10 // <63:0> of pMod[j]*m
|
||||
adds X_14, X_14, X_8 // Adding pSrc[j]
|
||||
umulh X_13, X_6, X_10 // <127:64> of pMod[j]*m
|
||||
adc X_13, X_13, xzr // Add the carry if any (***)
|
||||
adds X_12, X_12, X_14 // Add the lower bits of c
|
||||
adc X_13, X_13, xzr // Add the carry if any (***)
|
||||
// ***: These cannot produce extra carry as the maximum is
|
||||
// (2^64 - 1)*(2^64-1) + 2^64-1 + 2^64-1 = 2^128 - 1
|
||||
str X_12, [X_1] // pSrc[j] = (UINT64) c
|
||||
mov X_12, X_13 // c >>= 64
|
||||
|
||||
mul X_14, X_6, X_11 // <63:0> of pMod[j]*m
|
||||
adds X_14, X_14, X_9 // Adding pSrc[j]
|
||||
umulh X_13, X_6, X_11 // <127:64> of pMod[j]*m
|
||||
adc X_13, X_13, xzr // Add the carry if any (***)
|
||||
adds X_12, X_12, X_14 // Add the lower bits of c
|
||||
adc X_13, X_13, xzr // Add the carry if any (***)
|
||||
str X_12, [X_1, #8] // pSrc[j] = (UINT64) c
|
||||
mov X_12, X_13 // c >>= 64
|
||||
|
||||
ldr X_10, [X_0, #16] // pMod[j]
|
||||
ldr X_8, [X_1, #16] // pSrc[j]
|
||||
|
||||
mul X_14, X_6, X_10 // <63:0> of pMod[j]*m
|
||||
adds X_14, X_14, X_8 // Adding pSrc[j]
|
||||
umulh X_13, X_6, X_10 // <127:64> of pMod[j]*m
|
||||
adc X_13, X_13, xzr // Add the carry if any (***)
|
||||
adds X_12, X_12, X_14 // Add the lower bits of c
|
||||
adc X_13, X_13, xzr // Add the carry if any (***)
|
||||
str X_12, [X_1, #16] // pSrc[j] = (UINT64) c
|
||||
mov X_12, X_13 // c >>= 64
|
||||
|
||||
subs X_3, X_3, #1 // Move one digit up
|
||||
bne SymCryptFdef369MontgomeryReduceAsmInner
|
||||
|
||||
ldr X_8, [X_1, #24] // pSrc[nWords]
|
||||
adds X_12, X_12, X_8 // c + pSrc[nWords]
|
||||
adc X_13, xzr, xzr // Add the carry if any
|
||||
|
||||
adds X_12, X_12, X_7 // c + pSrc[nWords] + hc
|
||||
adc X_7, X_13, xzr // Add the carry if any and store into hc
|
||||
|
||||
str X_12, [X_1, #24] // pSrc[nWords] = c
|
||||
|
||||
subs X_4, X_4, #1 // Move one word up
|
||||
|
||||
add X_17, X_17, #8 // Move stored pSrc pointer one word up
|
||||
mov X_0, X_16 // Restore pMod pointer
|
||||
mov X_1, X_17 // Restore pSrc pointer
|
||||
|
||||
mov X_3, X_15 // Restore the digit counter
|
||||
|
||||
bne SymCryptFdef369MontgomeryReduceAsmOuter
|
||||
|
||||
//
|
||||
// Subtraction
|
||||
//
|
||||
|
||||
mov X_14, X_2 // Store pDst pointer
|
||||
|
||||
// Prepare the pointers for subtract
|
||||
mov X_0, X_17 // pSrc
|
||||
mov X_1, X_16 // pMod
|
||||
|
||||
mov X_10, X_7 // X_10 = hc
|
||||
mov X_3, X_15 // Restore the digit counter
|
||||
subs X_4, X_4, X_4 // Set the carry flag (i.e. no borrow)
|
||||
|
||||
LABEL(SymCryptFdef369MontgomeryReduceRawSubAsmLoop)
|
||||
sub X_3, X_3, #1 // Decrement the digit count by one
|
||||
// borrow is in the carry flag (flipped)
|
||||
|
||||
ldp X_4, X_6, [X_0, #24]! // Load two words of pSrc1
|
||||
ldp X_5, X_7, [X_1, #24]! // Load two words of pSrc2
|
||||
sbcs X_4, X_4, X_5
|
||||
sbcs X_6, X_6, X_7
|
||||
stp X_4, X_6, [X_2, #24]! // Store the result in the destination
|
||||
|
||||
ldr X_4, [X_0, #16] // Load one word of pSrc1
|
||||
ldr X_5, [X_1, #16] // Load one word of pSrc2
|
||||
sbcs X_4, X_4, X_5
|
||||
str X_4, [X_2, #16] // Store the result in the destination
|
||||
|
||||
cbnz X_3, SymCryptFdef369MontgomeryReduceRawSubAsmLoop
|
||||
|
||||
cset X_0, cc // If the carry is clear (borrow), set the return value to 1
|
||||
|
||||
orr X_11, X_10, X_0 // X_11 = hc|d
|
||||
|
||||
// Prepare the pointers for masked copy
|
||||
mov X_0, X_17 // pSrc
|
||||
mov X_1, X_14 // pDst
|
||||
|
||||
mov X_2, X_15 // Restore the digit counter
|
||||
subs X_4, X_10, X_11 // If (X_11 > X_10) clear the carry flag (i.e. borrow)
|
||||
|
||||
LABEL(SymCryptFdef369MontgomeryReduceMaskedCopyAsmLoop)
|
||||
sub X_2, X_2, #1 // decrement the digit count by one
|
||||
|
||||
ldp X_4, X_6, [X_0, #24]! // Load two words of the source
|
||||
ldp X_5, X_7, [X_1, #24]! // Load two words of the destination
|
||||
csel X_4, X_4, X_5, cc // If the carry is clear, select the source operands
|
||||
csel X_6, X_6, X_7, cc
|
||||
stp X_4, X_6, [X_1] // Store the two words in the destination
|
||||
|
||||
ldr X_4, [X_0, #16] // Load one word of the source
|
||||
ldr X_5, [X_1, #16] // Load one word of the destination
|
||||
csel X_4, X_4, X_5, cc // If the carry is clear, select the source operands
|
||||
str X_4, [X_1, #16] // Store the one word in the destination
|
||||
|
||||
cbnz X_2, SymCryptFdef369MontgomeryReduceMaskedCopyAsmLoop
|
||||
|
||||
// Done, no return value
|
||||
|
||||
FUNCTION_END(ARM64EC_NAME_MANGLE(SymCryptFdef369MontgomeryReduceAsm))
|
||||
|
||||
FILE_END()
@@ -1,768 +0,0 @@
;
|
||||
; fdef_asm.asm Assembler code for large integer arithmetic in the default data format for the arm64 architecture
|
||||
;
|
||||
; Copyright (c) Microsoft Corporation. Licensed under the MIT license.
|
||||
;
|
||||
|
||||
#include "ksarm64.h"
|
||||
|
||||
; As Arm assembler already uses C preprocessor, we can just hardcode this asm to include constants
|
||||
; MASM for now. To be fixed properly when converting arm64 asm to symcryptasm.
|
||||
#define SYMCRYPT_MASM
|
||||
#include "C_asm_shared.inc"
|
||||
#undef SYMCRYPT_MASM
|
||||
|
||||
#include "symcrypt_version.inc"
|
||||
#include "symcrypt_name_mangling.inc"
|
||||
#include "symcrypt_magic.inc"
|
||||
|
||||
; A digit consists of 4 words of 64 bits each
|
||||
|
||||
;UINT32
|
||||
;SYMCRYPT_CALL
|
||||
; SymCryptFdefRawAdd(
|
||||
; _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc1,
|
||||
; _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc2,
|
||||
; _Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 pDst,
|
||||
; UINT32 nDigits );
|
||||
;
|
||||
; Initial inputs to registers:
|
||||
; pSrc1 -> x0
|
||||
; pSrc2 -> x1
|
||||
; pDst -> x2
|
||||
; nDigits -> x3
|
||||
|
||||
LEAF_ENTRY ARM64EC_NAME_MANGLE(SymCryptFdefRawAddAsm)
|
||||
|
||||
neg x3, x3 ; negate the digit count
|
||||
ands x4, x4, x4 ; Zero the carry flag
|
||||
|
||||
SymCryptFdefRawAddAsmLoop
|
||||
add x3, x3, #1 ; Increment the digit count by one
|
||||
; carry is in the carry flag
|
||||
|
||||
ldp x4, x6, [x0], #16 ; Load two words of pSrc1
|
||||
ldp x5, x7, [x1], #16 ; Load two words of pSrc2
|
||||
adcs x4, x4, x5
|
||||
adcs x6, x6, x7
|
||||
stp x4, x6, [x2], #16 ; Store the result in the destination
|
||||
|
||||
ldp x4, x6, [x0], #16 ; Load two words of pSrc1
|
||||
ldp x5, x7, [x1], #16 ; Load two words of pSrc2
|
||||
adcs x4, x4, x5
|
||||
adcs x6, x6, x7
|
||||
stp x4, x6, [x2], #16 ; Store the result in the destination
|
||||
|
||||
cbnz x3, SymCryptFdefRawAddAsmLoop
|
||||
|
||||
csetcs x0 ; Set the return value equal to the carry
|
||||
|
||||
ret
|
||||
|
||||
LEAF_END ARM64EC_NAME_MANGLE(SymCryptFdefRawAddAsm)
|
||||
|
||||
;UINT32
|
||||
;SYMCRYPT_CALL
|
||||
;SymCryptFdefRawSub(
|
||||
; _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 Src1,
|
||||
; _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 Src2,
|
||||
; _Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 Dst,
|
||||
; UINT32 nDigits )
|
||||
;
|
||||
; Initial inputs to registers:
|
||||
; pSrc1 -> x0
|
||||
; pSrc2 -> x1
|
||||
; pDst -> x2
|
||||
; nDigits -> x3
|
||||
|
||||
LEAF_ENTRY ARM64EC_NAME_MANGLE(SymCryptFdefRawSubAsm)
|
||||
|
||||
neg x3, x3 ; negate the digit count
|
||||
subs x4, x4, x4 ; Set the carry flag (i.e. no borrow)
|
||||
|
||||
SymCryptFdefRawSubAsmLoop
|
||||
add x3, x3, #1 ; Increment the digit count by one
|
||||
; borrow is in the carry flag (flipped)
|
||||
|
||||
ldp x4, x6, [x0], #16 ; Load two words of pSrc1
|
||||
ldp x5, x7, [x1], #16 ; Load two words of pSrc2
|
||||
sbcs x4, x4, x5
|
||||
sbcs x6, x6, x7
|
||||
stp x4, x6, [x2], #16 ; Store the result in the destination
|
||||
|
||||
ldp x4, x6, [x0], #16 ; Load two words of pSrc1
|
||||
ldp x5, x7, [x1], #16 ; Load two words of pSrc2
|
||||
sbcs x4, x4, x5
|
||||
sbcs x6, x6, x7
|
||||
stp x4, x6, [x2], #16 ; Store the result in the destination
|
||||
|
||||
cbnz x3, SymCryptFdefRawSubAsmLoop
|
||||
|
||||
csetcc x0 ; If the carry is clear (borrow), set the return value to 1
|
||||
|
||||
ret
|
||||
|
||||
LEAF_END ARM64EC_NAME_MANGLE(SymCryptFdefRawSubAsm)
|
||||
|
||||
;VOID
|
||||
;SYMCRYPT_CALL
|
||||
;SymCryptFdefMaskedCopy(
|
||||
; _In_reads_bytes_( nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCBYTE pbSrc,
|
||||
; _InOut_writes_bytes_( nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PBYTE pbDst,
|
||||
; UINT32 nDigits,
|
||||
; UINT32 mask )
|
||||
|
||||
LEAF_ENTRY ARM64EC_NAME_MANGLE(SymCryptFdefMaskedCopyAsm)
|
||||
|
||||
neg x2, x2 ; negate the digit count
|
||||
subs x4, XZR, x3 ; If (x3 > 0) clear the carry flag (i.e. borrow)
|
||||
|
||||
SymCryptFdefMaskedCopyAsmLoop
|
||||
add x2, x2, #1 ; Increment the digit count by one
|
||||
|
||||
ldp x4, x6, [x0], #16 ; Load two words of the source
|
||||
ldp x5, x7, [x1] ; Load two words of the destination
|
||||
cselcc x4, x4, x5 ; If the carry is clear, select the source operands
|
||||
cselcc x6, x6, x7
|
||||
stp x4, x6, [x1], #16 ; Store the two words in the destination
|
||||
|
||||
ldp x4, x6, [x0], #16
|
||||
ldp x5, x7, [x1]
|
||||
cselcc x4, x4, x5
|
||||
cselcc x6, x6, x7
|
||||
stp x4, x6, [x1], #16
|
||||
|
||||
cbnz x2, SymCryptFdefMaskedCopyAsmLoop
|
||||
|
||||
; Done, no return value
|
||||
|
||||
ret
|
||||
|
||||
LEAF_END ARM64EC_NAME_MANGLE(SymCryptFdefMaskedCopyAsm)
|
||||
|
||||
;VOID
|
||||
;SYMCRYPT_CALL
|
||||
;SymCryptFdefRawMul(
|
||||
; _In_reads_(nWords1) PCUINT32 pSrc1,
|
||||
; UINT32 nDigits1,
|
||||
; _In_reads_(nWords2) PCUINT32 pSrc2,
|
||||
; UINT32 nDigits2,
|
||||
; _Out_writes_(nWords1 + nWords2) PUINT32 pDst )
|
||||
;
|
||||
; Initial inputs to registers:
|
||||
; pSrc1 -> x0
|
||||
; nDigits1 -> x1
|
||||
; pSrc2 -> x2
|
||||
; nDigits2 -> x3
|
||||
; pDst -> x4
|
||||
;
|
||||
; Basic structure:
|
||||
; for each word in Src1:
|
||||
; Dst += Src2 * word
|
||||
;
|
||||
; Register assignments
|
||||
; x0 = pSrc1 (moving forward one word every outer loop)
|
||||
; x1 = negated word count of pSrc1
|
||||
; x2 = pSrc2 (moving forward one *digit* every inner loop)
|
||||
; x3 = negated digit count of pSrc2 and pDst
|
||||
; x4 = pDst (moving forward one *digit* every inner loop)
|
||||
; x5 = Stored pDst (moving forward one word every outer loop)
|
||||
; x6 = Current word loaded from pSrc1
|
||||
; x8, x9 = Current words loaded in pairs from pSrc2
|
||||
; x10, x11 = Current words loaded in pairs from pDst
|
||||
; x12, x15 = "128-bit" sliding register to hold the result of multiplies
|
||||
; x16 = Stored pSrc2
|
||||
; x17 = Stored negated digit count of pSrc2
|
||||
; Note x13, x14 are reserved in ARM64EC and thus are not used
|
||||
|
||||
|
||||
LEAF_ENTRY ARM64EC_NAME_MANGLE(SymCryptFdefRawMulAsm)
|
||||
|
||||
lsl x1, x1, #2 ; Calculate word count
|
||||
|
||||
neg x1, x1 ; negate nWords1
|
||||
neg x3, x3 ; negate nDigits2
|
||||
|
||||
mov x5, x4 ; store pDst
|
||||
mov x16, x2 ; store pSrc2
|
||||
mov x17, x3 ; store -nDigits2 for later
|
||||
|
||||
;
|
||||
; First iteration of main loop (no adding of previous values from pDst)
|
||||
;
|
||||
ands x15, x15, XZR ; Clearing the carry flag and setting x15 = 0
|
||||
ldr x6, [x0] ; load the first word from pSrc1
|
||||
|
||||
SymCryptFdefRawMulAsmLoopInner1
|
||||
add x3, x3, #1 ; move one digit up
|
||||
|
||||
ldp x8, x9, [x2] ; load 2 words from pSrc2
|
||||
|
||||
mul x12, x6, x8 ; Bits <63:0> of pSrc1[0]*pSrc2[j]
|
||||
adcs x12, x12, x15 ; Adding the previous word (if there was a carry from the last addition it is added)
|
||||
umulh x15, x6, x8 ; Bits <127:64> of pSrc1[0]*pSrc2[j]
|
||||
str x12, [x4] ; Store to destination
|
||||
|
||||
mul x12, x6, x9 ; Bits <63:0> of pSrc1[0]*pSrc2[j+1]
|
||||
adcs x12, x12, x15 ; Adding the previous word (if there was a carry from the last addition it is added)
|
||||
umulh x15, x6, x9 ; Bits <127:64> of pSrc1[0]*pSrc2[j+1]
|
||||
str x12, [x4, #8] ; Store to destination
|
||||
|
||||
ldp x8, x9, [x2, #16] ; load 2 words from pSrc2
|
||||
|
||||
mul x12, x6, x8 ; Bits <63:0> of pSrc1[0]*pSrc2[j+2]
|
||||
adcs x12, x12, x15 ; Adding the previous word (if there was a carry from the last addition it is added)
|
||||
umulh x15, x6, x8 ; Bits <127:64> of pSrc1[0]*pSrc2[j+2]
|
||||
str x12, [x4, #16] ; Store to destination
|
||||
|
||||
mul x12, x6, x9 ; Bits <63:0> of pSrc1[0]*pSrc2[j+3]
|
||||
adcs x12, x12, x15 ; Adding the previous word (if there was a carry from the last addition it is added)
|
||||
umulh x15, x6, x9 ; Bits <127:64> of pSrc1[0]*pSrc2[j+3]
|
||||
str x12, [x4, #24] ; Store to destination
|
||||
|
||||
add x2, x2, #32
|
||||
add x4, x4, #32
|
||||
|
||||
cbnz x3, SymCryptFdefRawMulAsmLoopInner1
|
||||
|
||||
adc x15, x15, XZR ; Store the next word into the destination (with the carry if any)
|
||||
str x15, [x4]
|
||||
|
||||
add x1, x1, #1 ; move one word up
|
||||
add x0, x0, #8 ; move start of pSrc1 one word up
|
||||
add x5, x5, #8 ; move start of pDst one word up
|
||||
|
||||
;
|
||||
; MAIN LOOP
|
||||
;
|
||||
SymCryptFdefRawMulAsmLoopOuter
|
||||
mov x3, x17 ; set -nDigits2
|
||||
mov x2, x16 ; set pSrc2
|
||||
mov x4, x5 ; set pDst
|
||||
|
||||
ands x15, x15, XZR ; Clearing the carry flag and setting x15 = 0
|
||||
ldr x6, [x0] ; load the next word from pSrc1
|
||||
|
||||
SymCryptFdefRawMulAsmLoopInner
|
||||
add x3, x3, #1 ; move one digit up
|
||||
|
||||
ldp x8, x9, [x2] ; load 2 words from pSrc2
|
||||
ldp x10, x11, [x4] ; load 2 words from pDst
|
||||
|
||||
mul x12, x6, x8 ; Bits <63:0> of pSrc1[i]*pSrc2[j]
|
||||
adcs x12, x12, x15 ; Adding the previous word (if there was a carry from the last addition it is added)
|
||||
umulh x15, x6, x8 ; Bits <127:64> of pSrc1[i]*pSrc2[j]
|
||||
adc x15, x15, XZR ; Add the carry if any and don't update the flags
|
||||
; Note: this cannot overflow as the maximum for <x15:x12> is (2^64-1)(2^64-1)+(2^64-1)+1 = 2^128 - 2^64 + 1
|
||||
adds x12, x12, x10 ; add the word from the destination and update the flags (this can overflow)
|
||||
str x12, [x4] ; Store to destination
|
||||
|
||||
mul x12, x6, x9 ; Bits <63:0> of pSrc1[i]*pSrc2[j+1]
|
||||
adcs x12, x12, x15 ; Adding the previous word (if there was a carry from the last addition it is added)
|
||||
umulh x15, x6, x9 ; Bits <127:64> of pSrc1[i]*pSrc2[j+1]
|
||||
adc x15, x15, XZR ; Add the carry if any and don't update the flags
|
||||
adds x12, x12, x11 ; add the word from the destination and update the flags (this can overflow)
|
||||
str x12, [x4, #8] ; Store to destination
|
||||
|
||||
ldp x8, x9, [x2, #16] ; load 2 words from pSrc2
|
||||
ldp x10, x11, [x4, #16] ; load 2 words from pDst
|
||||
|
||||
mul x12, x6, x8 ; Bits <63:0> of pSrc1[i]*pSrc2[j+2]
|
||||
adcs x12, x12, x15 ; Adding the previous word (if there was a carry from the last addition it is added)
|
||||
umulh x15, x6, x8 ; Bits <127:64> of pSrc1[i]*pSrc2[j+2]
|
||||
adc x15, x15, XZR ; Add the carry if any and don't update the flags
|
||||
adds x12, x12, x10 ; add the word from the destination and update the flags (this can overflow)
|
||||
str x12, [x4, #16] ; Store to destination
|
||||
|
||||
mul x12, x6, x9 ; Bits <63:0> of pSrc1[i]*pSrc2[j+3]
|
||||
adcs x12, x12, x15 ; Adding the previous word (if there was a carry from the last addition it is added)
|
||||
umulh x15, x6, x9 ; Bits <127:64> of pSrc1[i]*pSrc2[j+3]
|
||||
adc x15, x15, XZR ; Add the carry if any and don't update the flags
|
||||
adds x12, x12, x11 ; add the word from the destination and update the flags (this can overflow)
|
||||
str x12, [x4, #24] ; Store to destination
|
||||
|
||||
add x2, x2, #32
|
||||
add x4, x4, #32
|
||||
|
||||
cbnz x3, SymCryptFdefRawMulAsmLoopInner
|
||||
|
||||
adc x15, x15, XZR ; Store the next word into the destination (with the carry if any)
|
||||
str x15, [x4]
|
||||
|
||||
adds x1, x1, #1 ; move one word up
|
||||
add x0, x0, #8 ; move start of pSrc1 one word up
|
||||
add x5, x5, #8 ; move start of pDst one word up
|
||||
|
||||
bne SymCryptFdefRawMulAsmLoopOuter
|
||||
|
||||
; Done, no return value
|
||||
|
||||
ret
|
||||
|
||||
LEAF_END ARM64EC_NAME_MANGLE(SymCryptFdefRawMulAsm)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
; Macro for the first loop of the first pass of RawSquareAsm.
|
||||
; It takes one word from the source, multiplies it with the mulword,
|
||||
; adds the high level word of the previous macro call, and stores it into
|
||||
; the destination.
|
||||
;
|
||||
; No carry flag is propagated from the previous macro call as the maximum is
|
||||
; (2^64-1)^2 + 2^64-1 = 2^128 - 2^64
|
||||
MACRO
|
||||
SQR_SINGLEADD_64 $index
|
||||
|
||||
ldr x8, [x2, #8*$index] ; pSrc[i+j]
|
||||
|
||||
mul x12, x6, x8 ; Bits <63:0> of pSrc[i]*pSrc[i+j]
|
||||
adds x12, x12, x15 ; Adding the previous word
|
||||
umulh x15, x6, x8 ; Bits <127:64> of pSrc[i]*pSrc[i+j]
|
||||
adc x15, x15, XZR ; Add the intermediate carry and don't update the flags
|
||||
|
||||
str x12, [x4, #8*$index] ; Store to destination
|
||||
|
||||
MEND
|
||||
|
||||
; Macro for the remaining loops of the first pass of RawSquareAsm.
|
||||
; The only difference to the above is that it also adds the word loaded
|
||||
; from the destination buffer.
|
||||
;
|
||||
; No carry flag is propagated from the previous macro call as the maximum is
|
||||
; (2^64-1)^2 + 2(2^64-1) = 2^128 - 1
|
||||
MACRO
|
||||
SQR_DOUBLEADD_64 $index
|
||||
|
||||
ldr x8, [x2, #8*$index] ; pSrc[i+j]
|
||||
ldr x10, [x4, #8*$index] ; pDst[2*(i+j)]
|
||||
|
||||
mul x12, x6, x8 ; Bits <63:0> of pSrc[i]*pSrc[i+j]
|
||||
adds x12, x12, x15 ; Adding the previous word
|
||||
umulh x15, x6, x8 ; Bits <127:64> of pSrc[i]*pSrc[i+j]
|
||||
adc x15, x15, XZR ; Add the intermediate carry and don't update the flags
|
||||
|
||||
adds x12, x12, x10 ; Add the word from the destination
|
||||
adc x15, x15, XZR ; Add the intermediate carry and don't update the flags
|
||||
|
||||
str x12, [x4, #8*$index] ; Store to destination
|
||||
|
||||
MEND
|
||||
|
||||
; Macro for the third pass loop of RawSquareAsm.
|
||||
; It takes one mulword from the source, squares it, and
|
||||
; adds it to the even columns of the destination. The carries are propagated
|
||||
; to the odd columns.
|
||||
;
|
||||
; Here we can have a (1-bit) carry to the next call because the maximum value for
|
||||
; a pair of columns is (2^64-1)^2+(2^128-1)+1 = 2^129 - 2^65 + 1 < 2^129 - 1
|
||||
MACRO
|
||||
SQR_DIAGONAL_PROP $index
|
||||
ldr x6, [x0, #8*$index] ; mulword
|
||||
mul x12, x6, x6 ; Bits <63:0> of m^2
|
||||
umulh x15, x6, x6 ; Bits <127:64> of m^2
|
||||
|
||||
ldp x8, x9, [x4, #16*$index] ; Load
|
||||
|
||||
; Adding the square to the even column
|
||||
adcs x12, x12, x8 ; carry from previous and update the flags
|
||||
|
||||
; Propagating the sum to the next column
|
||||
adcs x15, x15, x9 ; This can generate a carry
|
||||
|
||||
stp x12, x15, [x4, #16*$index] ; Store
|
||||
MEND
|
||||
|
||||
; VOID
|
||||
; SYMCRYPT_CALL
|
||||
; SymCryptFdefRawSquareAsm(
|
||||
; _In_reads_(nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc,
|
||||
; UINT32 nDigits,
|
||||
; _Out_writes_(2*nWords) PUINT32 pDst )
|
||||
;
|
||||
; Initial inputs to registers:
|
||||
; pSrc -> x0
|
||||
; nDigits -> x1
|
||||
; pDst -> x2
|
||||
;
|
||||
; Register assignments
|
||||
; x0 = pSrc
|
||||
; x1 = negated word count of pSrc
|
||||
; x2 = pSrc (moving forward one digit / 4 words every inner loop)
|
||||
; x3 = negated digit count of pSrc
|
||||
; x4 = pDst (moving forward one digit every inner loop)
|
||||
; x5 = pDst (moving forward one word every outer loop)
|
||||
; x6 = Current word loaded from pSrc
|
||||
; x8, x9 = Current words loaded in pairs from pSrc2
|
||||
; x10, x11 = Current words loaded in pairs from pDst
|
||||
; x12, x15 = "128-bit" sliding register to hold the result of multiplies
|
||||
; x16 = Stored pSrc
|
||||
; x17 = Negated digit count of pSrc
|
||||
; x19 = Stored negated digit count of pSrc
|
||||
; x20 = Stored pDst
|
||||
; Note x13, x14 are reserved in ARM64EC and thus are not used
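;
; For reference, a rough sketch of the three-pass squaring strategy used below (illustrative
; pseudo-C with hypothetical helpers add_at_word() and shift_left_1(); not the actual SymCrypt C code):
;
;   // Pass 1: accumulate the off-diagonal products pSrc[i]*pSrc[j] for i < j into pDst
;   for( i = 0; i < nWords; i++ )
;       for( j = i + 1; j < nWords; j++ )
;           add_at_word( pDst, i + j, (unsigned __int128) pSrc[i] * pSrc[j] );
;
;   // Pass 2: double the off-diagonal sum by shifting the whole 2*nWords result left by 1 bit
;   shift_left_1( pDst, 2 * nWords );
;
;   // Pass 3: add the diagonal squares pSrc[i]^2 into word pairs (2*i, 2*i+1)
;   for( i = 0; i < nWords; i++ )
;       add_at_word( pDst, 2 * i, (unsigned __int128) pSrc[i] * pSrc[i] );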
|
||||
|
||||
|
||||
NESTED_ENTRY ARM64EC_NAME_MANGLE(SymCryptFdefRawSquareAsm)
|
||||
PROLOG_SAVE_REG_PAIR fp, lr, #-32! ; allocate 32 bytes of stack; store FP/LR
|
||||
PROLOG_SAVE_REG_PAIR x19, x20, #16 ; free up x19/x20
|
||||
|
||||
mov x3, x1 ; digit count into x3
|
||||
|
||||
lsl x1, x1, #2 ; Calculate word count
|
||||
|
||||
neg x1, x1 ; negate nWords
|
||||
neg x3, x3 ; negate nDigits
|
||||
|
||||
mov x4, x2 ; pDst
|
||||
mov x5, x2 ; store pDst
|
||||
mov x20, x2 ; store pDst
|
||||
mov x16, x0 ; store pSrc
|
||||
mov x2, x0 ; inner loop pSrc
|
||||
mov x17, x3 ; store -nDigits for later
|
||||
mov x19, x3 ; store -nDigits for later
|
||||
|
||||
;
|
||||
; First iteration of main loop (no adding of previous values from pDst)
|
||||
;
|
||||
ands x15, x15, XZR ; Clearing the carry flag and setting x15 = 0
|
||||
ldr x6, [x0] ; load the first word from pSrc1
|
||||
str x15, [x4] ; store 0 for the first word
|
||||
|
||||
b SymCryptFdefRawSquareAsmInnerLoopInit_Word1
|
||||
|
||||
SymCryptFdefRawSquareAsmInnerLoopInit_Word0
|
||||
SQR_SINGLEADD_64 0
|
||||
|
||||
SymCryptFdefRawSquareAsmInnerLoopInit_Word1
|
||||
SQR_SINGLEADD_64 1
|
||||
|
||||
SQR_SINGLEADD_64 2
|
||||
|
||||
SQR_SINGLEADD_64 3
|
||||
|
||||
add x3, x3, #1 ; move one digit up
|
||||
add x2, x2, #32
|
||||
add x4, x4, #32
|
||||
|
||||
cbnz x3, SymCryptFdefRawSquareAsmInnerLoopInit_Word0
|
||||
|
||||
str x15, [x4] ; Store the next word into the destination
|
||||
|
||||
add x1, x1, #1 ; move one word up
|
||||
|
||||
mov x9, #1 ; Cyclic counter
|
||||
|
||||
;
|
||||
; MAIN LOOP
|
||||
;
|
||||
SymCryptFdefRawSquareAsmOuterLoop
|
||||
|
||||
add x5, x5, #8 ; move start of pDst one word up
|
||||
|
||||
mov x3, x17 ; set -nDigits
|
||||
mov x2, x0 ; set pSrc
|
||||
mov x4, x5 ; set pDst
|
||||
|
||||
ands x15, x15, XZR ; Clearing the carry flag and setting x15 = 0
|
||||
ldr x6, [x0, x9, LSL #3] ; load the next word from pSrc
|
||||
|
||||
; Cyclic counter and jump logic
|
||||
add x9, x9, #1
|
||||
cmp x9, #1
|
||||
beq SymCryptFdefRawSquareAsmInnerLoop_Word1
|
||||
cmp x9, #2
|
||||
beq SymCryptFdefRawSquareAsmInnerLoop_Word2
|
||||
cmp x9, #3
|
||||
beq SymCryptFdefRawSquareAsmInnerLoop_Word3
|
||||
|
||||
; The following instructions are only executed when x9 == 4
|
||||
mov x9, XZR ; Set it to 0
|
||||
|
||||
add x0, x0, #32 ; move start of pSrc 4 words up
|
||||
add x5, x5, #32 ; move pDst 4 words up
|
||||
|
||||
mov x2, x0 ; set pSrc
|
||||
mov x4, x5 ; set pDst
|
||||
|
||||
adds x17, x17, #1 ; add 1 digit
|
||||
mov x3, x17 ; set the new digit counter
|
||||
|
||||
SymCryptFdefRawSquareAsmInnerLoop_Word0
|
||||
SQR_DOUBLEADD_64 0
|
||||
|
||||
SymCryptFdefRawSquareAsmInnerLoop_Word1
|
||||
SQR_DOUBLEADD_64 1
|
||||
|
||||
SymCryptFdefRawSquareAsmInnerLoop_Word2
|
||||
SQR_DOUBLEADD_64 2
|
||||
|
||||
SymCryptFdefRawSquareAsmInnerLoop_Word3
|
||||
SQR_DOUBLEADD_64 3
|
||||
|
||||
add x3, x3, #1 ; move one digit up
|
||||
add x2, x2, #32
|
||||
add x4, x4, #32
|
||||
|
||||
cbnz x3, SymCryptFdefRawSquareAsmInnerLoop_Word0
|
||||
|
||||
str x15, [x4] ; Store the next word into the destination
|
||||
|
||||
adds x1, x1, #1 ; move one word up
|
||||
cmn x1, #1 ; Compare with -1
|
||||
bne SymCryptFdefRawSquareAsmOuterLoop
|
||||
|
||||
ands x15, x15, XZR ; Setting x15 = 0
|
||||
str x15, [x5, #40] ; Store 0 to destination for the top word
|
||||
|
||||
; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; Second Pass - Shifting all results 1 bit left
|
||||
; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
mov x3, x19 ; -nDigits
|
||||
lsl x3, x3, #1 ; Double digits
|
||||
mov x4, x20 ; pDst pointer
|
||||
ands x8, x8, XZR ; Clear the flags
|
||||
|
||||
SymCryptFdefRawSquareAsmSecondPass
|
||||
|
||||
add x3, x3, #1 ; move one digit up
|
||||
|
||||
ldp x8, x9, [x4]
|
||||
adcs x8, x8, x8 ; Shift left and add the carry
|
||||
adcs x9, x9, x9
|
||||
stp x8, x9, [x4], #16
|
||||
|
||||
ldp x10, x11, [x4]
|
||||
adcs x10, x10, x10 ; Shift left and add the carry
|
||||
adcs x11, x11, x11
|
||||
stp x10, x11, [x4], #16
|
||||
|
||||
cbnz x3, SymCryptFdefRawSquareAsmSecondPass
|
||||
|
||||
; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; Third Pass - Adding the squares on the even columns and propagating the sum
|
||||
; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
ands x8, x8, XZR ; Clear the flags
|
||||
mov x0, x16 ; src pointer
|
||||
mov x4, x20 ; pDst pointer
|
||||
mov x3, x19 ; -nDigits
|
||||
|
||||
SymCryptFdefRawSquareAsmThirdPass
|
||||
SQR_DIAGONAL_PROP 0
|
||||
SQR_DIAGONAL_PROP 1
|
||||
SQR_DIAGONAL_PROP 2
|
||||
SQR_DIAGONAL_PROP 3
|
||||
|
||||
add x3, x3, #1 ; move one digit up
|
||||
add x0, x0, #32 ; One digit up (not updated in SQR_DIAGONAL_PROP)
|
||||
add x4, x4, #64 ; Two digits up (not updated in SQR_DIAGONAL_PROP)
|
||||
|
||||
cbnz x3, SymCryptFdefRawSquareAsmThirdPass
|
||||
|
||||
; Done, no return value
|
||||
|
||||
EPILOG_RESTORE_REG_PAIR x19, x20, #16
|
||||
EPILOG_RESTORE_REG_PAIR fp, lr, #32!
|
||||
EPILOG_RETURN
|
||||
|
||||
NESTED_END ARM64EC_NAME_MANGLE(SymCryptFdefRawSquareAsm)
|
||||
|
||||
;VOID
|
||||
;SymCryptFdefMontgomeryReduceAsm(
|
||||
; _In_ PCSYMCRYPT_MODULUS pmMod,
|
||||
; _In_ PUINT32 pSrc,
|
||||
; _Out_ PUINT32 pDst )
|
||||
;
|
||||
; Initial inputs to registers:
|
||||
; pmMod -> x0
|
||||
; pSrc -> x1
|
||||
; pDst -> x2
|
||||
;
|
||||
; Register assignments
|
||||
; x0 = pMod (moving forward one *digit* every inner loop)
|
||||
; x1 = pSrc (moving forward one *digit* every inner loop)
|
||||
; x2 = pDst (used only in the end for subtract / result)
|
||||
; x3 = negated digit count of pSrc and pMod
|
||||
; x4 = negated word count of pSrc
|
||||
; x5 = Inv64 of the modulus
|
||||
; x6 = m = pSrc[i]*Inv64
|
||||
; x7 = hc = high carry variable
|
||||
; x8, x9 = Current words loaded in pairs from pSrc
|
||||
; x10, x11 = Current words loaded in pairs from pMod
|
||||
; x12, x15 = c variable = "128-bit" register to hold the result of multiplies
|
||||
; It is flipped between [x12:x15] and [x15:x12] instead of doing c>>=64
|
||||
; x16 = Temporary intermediate result
|
||||
; x17 = Stored negated digit count of pSrc
|
||||
; x19 = Stored pMod pointer
|
||||
; x20 = Stored pSrc pointer (moving forward one word every outer loop)
|
||||
; Note x13, x14 are reserved in ARM64EC and thus are not used
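;
; For reference, a rough sketch of the word-wise Montgomery reduction performed below
; (illustrative pseudo-C; the helper name add_mul_word() is hypothetical, not the actual SymCrypt C code):
;
;   // pSrc holds a 2*nWords-word value; hc starts at 0
;   for( i = 0; i < nWords; i++ )
;   {
;       UINT64 m = pSrc[i] * Inv64;               // Inv64 precomputed so that pSrc[i] + m*Mod[0] == 0 mod 2^64
;       carry = add_mul_word( &pSrc[i], Mod, m, nWords );  // pSrc[i..] += m * Mod; word i becomes 0
;       // fold carry and hc into pSrc[i + nWords]; any new overflow goes into hc
;   }
;   // Result is pSrc[nWords .. 2*nWords-1]; subtract Mod once if needed, selected via masked copy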
|
||||
|
||||
NESTED_ENTRY ARM64EC_NAME_MANGLE(SymCryptFdefMontgomeryReduceAsm)
|
||||
PROLOG_SAVE_REG_PAIR fp, lr, #-32!
|
||||
PROLOG_SAVE_REG_PAIR x19, x20, #16
|
||||
|
||||
ldr w3, [x0, #SymCryptModulusNdigitsOffsetArm64] ; # of Digits
|
||||
ldr x5, [x0, #SymCryptModulusMontgomeryInv64OffsetArm64] ; Inv64 of modulus
|
||||
add x0, x0, #SymCryptModulusValueOffsetArm64 ; pMod
|
||||
|
||||
lsl x4, x3, #2 ; Multiply by 4 to get the number of words
|
||||
|
||||
neg x3, x3 ; Negate the digit count
|
||||
neg x4, x4 ; Negate the word count
|
||||
|
||||
mov x17, x3 ; Store the digit count for later
|
||||
mov x19, x0 ; Store the pMod pointer
|
||||
mov x20, x1 ; Store the pSrc pointer
|
||||
|
||||
ands x7, x7, XZR ; Set hc to 0
|
||||
|
||||
;
|
||||
; Main loop
|
||||
;
|
||||
SymCryptFdefMontgomeryReduceAsmOuter
|
||||
ldr x8, [x1] ; Load 1 word from pSrc
|
||||
mul x6, x8, x5 ; <63:0> bits of pSrc[i]*Inv64 = m
|
||||
|
||||
ands x12, x12, XZR ; Set c to 0
|
||||
ands x15, x15, XZR ; Set c to 0
|
||||
|
||||
SymCryptFdefMontgomeryReduceAsmInner
|
||||
ldp x10, x11, [x0] ; pMod[j]
|
||||
ldp x8, x9, [x1] ; pSrc[j]
|
||||
|
||||
mul x16, x6, x10 ; <63:0> of pMod[j]*m
|
||||
adds x16, x16, x8 ; Adding pSrc[j]
|
||||
umulh x15, x6, x10 ; <127:64> of pMod[j]*m
|
||||
adc x15, x15, XZR ; Add the carry if any (***)
|
||||
adds x12, x12, x16 ; Add the lower bits of c
|
||||
adc x15, x15, XZR ; Add the carry if any (***)
|
||||
; ***: These cannot produce extra carry as the maximum is
|
||||
; (2^64 - 1)*(2^64-1) + 2^64-1 + 2^64-1 = 2^128 - 1
|
||||
str x12, [x1] ; pSrc[j] = (UINT64) c
|
||||
|
||||
mul x16, x6, x11 ; <63:0> of pMod[j]*m
|
||||
adds x16, x16, x9 ; Adding pSrc[j]
|
||||
umulh x12, x6, x11 ; <127:64> of pMod[j]*m
|
||||
adc x12, x12, XZR ; Add the carry if any (***)
|
||||
adds x15, x15, x16 ; Add the lower bits of c
|
||||
adc x12, x12, XZR ; Add the carry if any (***)
|
||||
str x15, [x1, #8] ; pSrc[j] = (UINT64) c
|
||||
|
||||
ldp x10, x11, [x0, #16] ; pMod[j]
|
||||
ldp x8, x9, [x1, #16] ; pSrc[j]
|
||||
|
||||
mul x16, x6, x10 ; <63:0> of pMod[j]*m
|
||||
adds x16, x16, x8 ; Adding pSrc[j]
|
||||
umulh x15, x6, x10 ; <127:64> of pMod[j]*m
|
||||
adc x15, x15, XZR ; Add the carry if any (***)
|
||||
adds x12, x12, x16 ; Add the lower bits of c
|
||||
adc x15, x15, XZR ; Add the carry if any (***)
|
||||
str x12, [x1, #16] ; pSrc[j] = (UINT64) c
|
||||
|
||||
mul x16, x6, x11 ; <63:0> of pMod[j]*m
|
||||
adds x16, x16, x9 ; Adding pSrc[j]
|
||||
umulh x12, x6, x11 ; <127:64> of pMod[j]*m
|
||||
adc x12, x12, XZR ; Add the carry if any (***)
|
||||
adds x15, x15, x16 ; Add the lower bits of c
|
||||
adc x12, x12, XZR ; Add the carry if any (***)
|
||||
str x15, [x1, #24] ; pSrc[j] = (UINT64) c
|
||||
|
||||
add x0, x0, #32
|
||||
add x1, x1, #32
|
||||
adds x3, x3, #1 ; Move one digit up
|
||||
bne SymCryptFdefMontgomeryReduceAsmInner
|
||||
|
||||
ldr x8, [x1] ; pSrc[nWords]
|
||||
adds x12, x12, x8 ; c + pSrc[nWords]
|
||||
adc x15, XZR, XZR ; Add the carry if any
|
||||
|
||||
adds x12, x12, x7 ; c + pSrc[nWords] + hc
|
||||
adc x7, x15, XZR ; Add the carry if any and store into hc
|
||||
|
||||
str x12, [x1] ; pSrc[nWords] = c
|
||||
|
||||
adds x4, x4, #1 ; Move one word up
|
||||
|
||||
add x20, x20, #8 ; Move stored pSrc pointer one word up
|
||||
mov x0, x19 ; Restore pMod pointer
|
||||
mov x1, x20 ; Restore pSrc pointer
|
||||
|
||||
mov x3, x17 ; Restore the digit counter
|
||||
|
||||
bne SymCryptFdefMontgomeryReduceAsmOuter
|
||||
|
||||
;
|
||||
; Subtraction
|
||||
;
|
||||
|
||||
mov x16, x2 ; Store pDst pointer
|
||||
|
||||
; Prepare the pointers for subtract
|
||||
mov x0, x20 ; pSrc
|
||||
mov x1, x19 ; pMod
|
||||
|
||||
mov x10, x7 ; x10 = hc
|
||||
mov x3, x17 ; Restore the digit counter
|
||||
subs x4, x4, x4 ; Set the carry flag (i.e. no borrow)
|
||||
|
||||
SymCryptFdefMontgomeryReduceRawSubAsmLoop
|
||||
add x3, x3, #1 ; Increment the digit count by one
|
||||
; borrow is in the carry flag (flipped)
|
||||
|
||||
ldp x4, x6, [x0], #16 ; Load two words of pSrc1
|
||||
ldp x5, x7, [x1], #16 ; Load two words of pSrc2
|
||||
sbcs x4, x4, x5
|
||||
sbcs x6, x6, x7
|
||||
stp x4, x6, [x2], #16 ; Store the result in the destination
|
||||
|
||||
|
||||
ldp x4, x6, [x0], #16 ; Load two words of pSrc1
|
||||
ldp x5, x7, [x1], #16 ; Load two words of pSrc2
|
||||
sbcs x4, x4, x5
|
||||
sbcs x6, x6, x7
|
||||
stp x4, x6, [x2], #16 ; Store the result in the destination
|
||||
|
||||
cbnz x3, SymCryptFdefMontgomeryReduceRawSubAsmLoop
|
||||
|
||||
csetcc x0 ; If the carry is clear (borrow), set the return value to 1
|
||||
|
||||
orr x11, x10, x0 ; x11 = hc|d
|
||||
|
||||
; Prepare the pointers for masked copy
|
||||
mov x0, x20 ; pSrc
|
||||
mov x1, x16 ; pDst
|
||||
|
||||
mov x2, x17 ; Restore the digit counter
|
||||
subs x4, x10, x11 ; If (x11 > x10) clear the carry flag (i.e. borrow)
|
||||
|
||||
SymCryptFdefMontgomeryReduceMaskedCopyAsmLoop
|
||||
add x2, x2, #1 ; Increment the digit count by one
|
||||
|
||||
ldp x4, x6, [x0], #16 ; Load two words of the source
|
||||
ldp x5, x7, [x1] ; Load two words of the destination
|
||||
cselcc x4, x4, x5 ; If the carry is clear, select the source operands
|
||||
cselcc x6, x6, x7
|
||||
stp x4, x6, [x1], #16 ; Store the two words in the destination
|
||||
|
||||
ldp x4, x6, [x0], #16
|
||||
ldp x5, x7, [x1]
|
||||
cselcc x4, x4, x5
|
||||
cselcc x6, x6, x7
|
||||
stp x4, x6, [x1], #16
|
||||
|
||||
cbnz x2, SymCryptFdefMontgomeryReduceMaskedCopyAsmLoop
|
||||
|
||||
; Done, no return value
|
||||
|
||||
EPILOG_RESTORE_REG_PAIR x19, x20, #16
|
||||
EPILOG_RESTORE_REG_PAIR fp, lr, #32!
|
||||
EPILOG_RETURN
|
||||
|
||||
NESTED_END ARM64EC_NAME_MANGLE(SymCryptFdefMontgomeryReduceAsm)
|
||||
|
||||
END
|
||||
|
|
@ -0,0 +1,705 @@
|
|||
//
|
||||
// fdef_asm.symcryptasm Assembler code for large integer arithmetic in the default data format for the arm64 architecture
|
||||
// Expresses asm in a generic enough way to enable generation of MASM and GAS using the
|
||||
// symcryptasm_processor.py script and C preprocessor
|
||||
//
|
||||
// Copyright (c) Microsoft Corporation. Licensed under the MIT license.
|
||||
//
|
||||
|
||||
#include "symcryptasm_shared.cppasm"
|
||||
|
||||
// A digit consists of 4 words of 64 bits each
|
||||
|
||||
//UINT32
|
||||
//SYMCRYPT_CALL
|
||||
//SymCryptFdefRawAddAsm(
|
||||
// _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 Src1,
|
||||
// _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 Src2,
|
||||
// _Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 Dst,
|
||||
// UINT32 nDigits )
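//
// For reference, the addition below is equivalent to the following illustrative C, treating the
// buffers as arrays of 64-bit words as the assembly does (not the actual SymCrypt C implementation):
//
//   UINT64 carry = 0;
//   for( i = 0; i < nDigits * 4; i++ )
//   {
//       unsigned __int128 t = (unsigned __int128) Src1[i] + Src2[i] + carry;
//       Dst[i] = (UINT64) t;
//       carry  = (UINT64) (t >> 64);
//   }
//   return (UINT32) carry;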
|
||||
|
||||
FUNCTION_START(ARM64EC_NAME_MANGLE(SymCryptFdefRawAddAsm), 4, 8)
|
||||
|
||||
ldp X_4, X_6, [X_0] // Load two words of pSrc1
|
||||
ldp X_5, X_7, [X_1] // Load two words of pSrc2
|
||||
adds X_4, X_4, X_5
|
||||
adcs X_6, X_6, X_7
|
||||
stp X_4, X_6, [X_2] // Store the result in the destination
|
||||
|
||||
ldp X_4, X_6, [X_0, #16] // Load two words of pSrc1
|
||||
sub X_3, X_3, #1 // Decrement the digit count by one
|
||||
ldp X_5, X_7, [X_1, #16] // Load two words of pSrc2
|
||||
adcs X_4, X_4, X_5
|
||||
adcs X_6, X_6, X_7
|
||||
stp X_4, X_6, [X_2, #16] // Store the result in the destination
|
||||
|
||||
cbz X_3, SymCryptFdefRawAddAsmEnd
|
||||
|
||||
LABEL(SymCryptFdefRawAddAsmLoop)
|
||||
// carry is in the carry flag
|
||||
// only update pointers to srcs and destination once per loop to reduce uops and dependencies
|
||||
ldp X_4, X_6, [X_0, #32]! // Load two words of pSrc1
|
||||
ldp X_5, X_7, [X_1, #32]! // Load two words of pSrc2
|
||||
adcs X_4, X_4, X_5
|
||||
adcs X_6, X_6, X_7
|
||||
stp X_4, X_6, [X_2, #32]! // Store the result in the destination
|
||||
|
||||
ldp X_4, X_6, [X_0, #16] // Load two words of pSrc1
|
||||
sub X_3, X_3, #1 // Decrement the digit count by one
|
||||
ldp X_5, X_7, [X_1, #16] // Load two words of pSrc2
|
||||
adcs X_4, X_4, X_5
|
||||
adcs X_6, X_6, X_7
|
||||
stp X_4, X_6, [X_2, #16] // Store the result in the destination
|
||||
|
||||
cbnz X_3, SymCryptFdefRawAddAsmLoop
|
||||
|
||||
ALIGN(4)
|
||||
LABEL(SymCryptFdefRawAddAsmEnd)
|
||||
cset X_0, cs // Set the return value equal to the carry
|
||||
|
||||
FUNCTION_END(ARM64EC_NAME_MANGLE(SymCryptFdefRawAddAsm))
|
||||
|
||||
//UINT32
|
||||
//SYMCRYPT_CALL
|
||||
//SymCryptFdefRawSubAsm(
|
||||
// _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc1,
|
||||
// _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc2,
|
||||
// _Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 pDst,
|
||||
// UINT32 nDigits )
|
||||
|
||||
FUNCTION_START(ARM64EC_NAME_MANGLE(SymCryptFdefRawSubAsm), 4, 8)
|
||||
|
||||
ldp X_4, X_6, [X_0] // Load two words of pSrc1
|
||||
ldp X_5, X_7, [X_1] // Load two words of pSrc2
|
||||
subs X_4, X_4, X_5
|
||||
sbcs X_6, X_6, X_7
|
||||
stp X_4, X_6, [X_2] // Store the result in the destination
|
||||
|
||||
ldp X_4, X_6, [X_0, #16] // Load two words of pSrc1
|
||||
sub X_3, X_3, #1 // Decrement the digit count by one
|
||||
ldp X_5, X_7, [X_1, #16] // Load two words of pSrc2
|
||||
sbcs X_4, X_4, X_5
|
||||
sbcs X_6, X_6, X_7
|
||||
stp X_4, X_6, [X_2, #16] // Store the result in the destination
|
||||
|
||||
cbz X_3, SymCryptFdefRawSubAsmEnd
|
||||
|
||||
LABEL(SymCryptFdefRawSubAsmLoop)
|
||||
// borrow is in the carry flag (flipped)
|
||||
// only update pointers to srcs and destination once per loop to reduce uops and dependencies
|
||||
ldp X_4, X_6, [X_0, #32]! // Load two words of pSrc1
|
||||
ldp X_5, X_7, [X_1, #32]! // Load two words of pSrc2
|
||||
sbcs X_4, X_4, X_5
|
||||
sbcs X_6, X_6, X_7
|
||||
stp X_4, X_6, [X_2, #32]! // Store the result in the destination
|
||||
|
||||
ldp X_4, X_6, [X_0, #16] // Load two words of pSrc1
|
||||
sub X_3, X_3, #1 // Decrement the digit count by one
|
||||
ldp X_5, X_7, [X_1, #16] // Load two words of pSrc2
|
||||
sbcs X_4, X_4, X_5
|
||||
sbcs X_6, X_6, X_7
|
||||
stp X_4, X_6, [X_2, #16] // Store the result in the destination
|
||||
|
||||
cbnz X_3, SymCryptFdefRawSubAsmLoop
|
||||
|
||||
ALIGN(4)
|
||||
LABEL(SymCryptFdefRawSubAsmEnd)
|
||||
cset X_0, cc // If the carry is clear (borrow), set the return value to 1
|
||||
|
||||
FUNCTION_END(ARM64EC_NAME_MANGLE(SymCryptFdefRawSubAsm))
|
||||
|
||||
//VOID
|
||||
//SYMCRYPT_CALL
|
||||
//SymCryptFdefMaskedCopyAsm(
|
||||
// _In_reads_bytes_( nDigits*SYMCRYPT_FDEF_DIGIT_SIZE ) PCBYTE pbSrc,
|
||||
// _Inout_updates_bytes_( nDigits*SYMCRYPT_FDEF_DIGIT_SIZE ) PBYTE pbDst,
|
||||
// UINT32 nDigits,
|
||||
// UINT32 mask )
|
||||
|
||||
FUNCTION_START(ARM64EC_NAME_MANGLE(SymCryptFdefMaskedCopyAsm), 4, 4)
|
||||
|
||||
dup v0.4s, W_3 // broadcast the mask to v0
|
||||
|
||||
LABEL(SymCryptFdefMaskedCopyAsmLoop)
|
||||
ldp q1, q3, [X_0], #32 // Load 4 words of the source
|
||||
ldp q2, q4, [X_1] // Load 4 words of the destination
|
||||
bit v2.16b, v1.16b, v0.16b // if the mask is 1s, overwrite the destination with source
|
||||
bit v4.16b, v3.16b, v0.16b // if the mask is 1s, overwrite the destination with source
|
||||
stp q2, q4, [X_1], #32 // Store the four words in the destination
|
||||
|
||||
sub X_2, X_2, #1 // Decrement the digit count by one
|
||||
|
||||
cbnz X_2, SymCryptFdefMaskedCopyAsmLoop
|
||||
|
||||
// Done, no return value
|
||||
|
||||
FUNCTION_END(ARM64EC_NAME_MANGLE(SymCryptFdefMaskedCopyAsm))
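//
// For reference, the constant-time selection the NEON "bit" instructions above perform is,
// word for word, equivalent to the following illustrative C, viewing the buffers as word arrays
// and the mask as replicated to word width (an illustrative view, not the SymCrypt C code):
//
//   for( i = 0; i < nWords; i++ )
//   {
//       pDst[i] = (pSrc[i] & mask) | (pDst[i] & ~mask);
//   }
//
// i.e. the destination is overwritten only when the mask is set, with no data-dependent branch.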
|
||||
|
||||
//VOID
|
||||
//SYMCRYPT_CALL
|
||||
//SymCryptFdefRawMulAsm(
|
||||
// _In_reads_(nDigits1*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc1,
|
||||
// UINT32 nDigits1,
|
||||
// _In_reads_(nDigits2*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc2,
|
||||
// UINT32 nDigits2,
|
||||
// _Out_writes_((nDigits1+nDigits2)*SYMCRYPT_FDEF_DIGIT_NUINT32) PUINT32 pDst )
|
||||
//
|
||||
// Basic structure:
|
||||
// for each word in Src1:
|
||||
// Dst += Src2 * word
|
||||
//
|
||||
// Register assignments
|
||||
// X_0 = pSrc1 (moving forward one word every outer loop)
|
||||
// X_1 = word count of pSrc1
|
||||
// X_2 = pSrc2 (moving forward one *digit* every inner loop)
|
||||
// X_3 = digit count of pSrc2 and pDst
|
||||
// X_4 = pDst (moving forward one *digit* every inner loop)
|
||||
// X_5 = Stored pDst (moving forward one word every outer loop)
|
||||
// X_6 = Current word loaded from pSrc1
|
||||
// X_7, X_8 = Current words loaded in pairs from pSrc2
|
||||
// X_9, X_10 = Current words loaded in pairs from pDst
|
||||
// X_11, X_12 = Scratch registers for holding the results of multiplies
|
||||
// X_13 = Stored pSrc2
|
||||
// X_14 = Stored digit count of pSrc2
|
||||
// X_15 = Scratch register for holding the results of multiplies
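//
// For reference, one multiply-accumulate step of the inner loop below corresponds to the
// following illustrative C (names are assumptions, not the actual SymCrypt C code):
//
//   unsigned __int128 t = (unsigned __int128) word * src2_word + dst_word + carry;
//   dst_word = (UINT64) t;
//   carry    = (UINT64) (t >> 64);
//
// The maximum of t is (2^64-1)^2 + 2*(2^64-1) = 2^128 - 1, so the carry always fits in 64 bits
// and the adc instructions that fold it in cannot overflow.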
|
||||
|
||||
FUNCTION_START(ARM64EC_NAME_MANGLE(SymCryptFdefRawMulAsm), 5, 16)
|
||||
|
||||
lsl X_1, X_1, #2 // Calculate word count
|
||||
|
||||
sub X_2, X_2, #32 // offset pSrc2 so we can use pre-increment form of loads
|
||||
sub X_4, X_4, #32 // offset pDst so we can use pre-increment form of loads
|
||||
|
||||
mov X_5, X_4 // store pDst
|
||||
mov X_13, X_2 // store pSrc2
|
||||
mov X_14, X_3 // store nDigits2 for later
|
||||
|
||||
//
|
||||
// First iteration of main loop (no adding of previous values from pDst)
|
||||
//
|
||||
ands X_12, X_12, xzr // Clearing the carry flag and setting X_12 = 0
|
||||
ldr X_6, [X_0] // load the first word from pSrc1
|
||||
|
||||
LABEL(SymCryptFdefRawMulAsmLoopInner1)
|
||||
sub X_3, X_3, #1 // move one digit up
|
||||
|
||||
ldp X_7, X_8, [X_2, #32]! // load 2 words from pSrc2
|
||||
|
||||
mul X_11, X_6, X_7 // Bits <63:0> of pSrc1[0]*pSrc2[j]
|
||||
adcs X_11, X_11, X_12 // Adding the previous word (if there was a carry from the last addition it is added)
|
||||
umulh X_12, X_6, X_7 // Bits <127:64> of pSrc1[0]*pSrc2[j]
|
||||
|
||||
mul X_15, X_6, X_8 // Bits <63:0> of pSrc1[0]*pSrc2[j+1]
|
||||
adcs X_15, X_15, X_12 // Adding the previous word (if there was a carry from the last addition it is added)
|
||||
umulh X_12, X_6, X_8 // Bits <127:64> of pSrc1[0]*pSrc2[j+1]
|
||||
|
||||
stp X_11, X_15, [X_4, #32]! // Store to destination
|
||||
ldp X_7, X_8, [X_2, #16] // load 2 words from pSrc2
|
||||
|
||||
mul X_11, X_6, X_7 // Bits <63:0> of pSrc1[0]*pSrc2[j+2]
|
||||
adcs X_11, X_11, X_12 // Adding the previous word (if there was a carry from the last addition it is added)
|
||||
umulh X_12, X_6, X_7 // Bits <127:64> of pSrc1[0]*pSrc2[j+2]
|
||||
|
||||
mul X_15, X_6, X_8 // Bits <63:0> of pSrc1[0]*pSrc2[j+3]
|
||||
adcs X_15, X_15, X_12 // Adding the previous word (if there was a carry from the last addition it is added)
|
||||
umulh X_12, X_6, X_8 // Bits <127:64> of pSrc1[0]*pSrc2[j+3]
|
||||
|
||||
stp X_11, X_15, [X_4, #16] // Store to destination
|
||||
|
||||
cbnz X_3, SymCryptFdefRawMulAsmLoopInner1
|
||||
|
||||
adc X_12, X_12, xzr // Store the next word into the destination (with the carry if any)
|
||||
str X_12, [X_4, #32]
|
||||
|
||||
sub X_1, X_1, #1 // move one word up
|
||||
add X_0, X_0, #8 // move start of pSrc1 one word up
|
||||
add X_5, X_5, #8 // move start of pDst one word up
|
||||
|
||||
//
|
||||
// MAIN LOOP
|
||||
//
|
||||
LABEL(SymCryptFdefRawMulAsmLoopOuter)
|
||||
mov X_3, X_14 // set nDigits2
|
||||
mov X_2, X_13 // set pSrc2
|
||||
mov X_4, X_5 // set pDst
|
||||
|
||||
ands X_12, X_12, xzr // Clearing the carry flag and setting X_12 = 0
|
||||
ldr X_6, [X_0] // load the next word from pSrc1
|
||||
|
||||
LABEL(SymCryptFdefRawMulAsmLoopInner)
|
||||
sub X_3, X_3, #1 // move one digit up
|
||||
|
||||
ldp X_7, X_8, [X_2, #32]! // load 2 words from pSrc2
|
||||
ldp X_9, X_10, [X_4, #32]! // load 2 words from pDst
|
||||
|
||||
adcs X_9, X_9, X_12 // Adding the previous word (if there was a carry from the last addition it is added)
|
||||
umulh X_11, X_6, X_7 // Bits <127:64> of pSrc1[i]*pSrc2[j]
|
||||
adcs X_10, X_11, X_10 // Adding the previous word (if there was a carry from the last addition it is added)
|
||||
umulh X_12, X_6, X_8 // Bits <127:64> of pSrc1[i]*pSrc2[j+1]
|
||||
adc X_12, X_12, xzr // Add the carry if any and don't update the flags
|
||||
|
||||
mul X_11, X_6, X_7 // Bits <63:0> of pSrc1[i]*pSrc2[j]
|
||||
adds X_9, X_9, X_11 // add the low 64 bits of the product and update the flags (this can overflow)
|
||||
mul X_11, X_6, X_8 // Bits <63:0> of pSrc1[i]*pSrc2[j+1]
|
||||
adcs X_10, X_10, X_11 // add the low 64 bits of the product plus the carry and update the flags (this can overflow)
|
||||
|
||||
stp X_9, X_10, [X_4] // Store to destination
|
||||
|
||||
ldp X_7, X_8, [X_2, #16] // load 2 words from pSrc2
|
||||
ldp X_9, X_10, [X_4, #16] // load 2 words from pDst
|
||||
|
||||
adcs X_9, X_9, X_12 // Adding the previous word (if there was a carry from the last addition it is added)
|
||||
umulh X_11, X_6, X_7 // Bits <127:64> of pSrc1[i]*pSrc2[j+2]
|
||||
adcs X_10, X_11, X_10 // Adding the previous word (if there was a carry from the last addition it is added)
|
||||
umulh X_12, X_6, X_8 // Bits <127:64> of pSrc1[i]*pSrc2[j+3]
|
||||
adc X_12, X_12, xzr // Add the carry if any and don't update the flags
|
||||
|
||||
mul X_11, X_6, X_7 // Bits <63:0> of pSrc1[i]*pSrc2[j+2]
|
||||
adds X_9, X_9, X_11 // add the low 64 bits of the product and update the flags (this can overflow)
|
||||
mul X_11, X_6, X_8 // Bits <63:0> of pSrc1[i]*pSrc2[j+3]
|
||||
adcs X_10, X_10, X_11 // add the low 64 bits of the product plus the carry and update the flags (this can overflow)
|
||||
|
||||
stp X_9, X_10, [X_4, #16] // Store to destination
|
||||
|
||||
cbnz X_3, SymCryptFdefRawMulAsmLoopInner
|
||||
|
||||
adc X_12, X_12, xzr // Store the next word into the destination (with the carry if any)
|
||||
str X_12, [X_4, #32]
|
||||
|
||||
subs X_1, X_1, #1 // move one word up
|
||||
add X_0, X_0, #8 // move start of pSrc1 one word up
|
||||
add X_5, X_5, #8 // move start of pDst one word up
|
||||
|
||||
bne SymCryptFdefRawMulAsmLoopOuter
|
||||
|
||||
// Done, no return value
|
||||
|
||||
FUNCTION_END(ARM64EC_NAME_MANGLE(SymCryptFdefRawMulAsm))
|
||||
|
||||
// Macro for the first loop of the first pass of RawSquareAsm.
|
||||
// It takes one word from the source, multiplies it with the mulword,
|
||||
// adds the high level word of the previous macro call, and stores it into
|
||||
// the destination.
|
||||
//
|
||||
// No carry flag is propagated from the previous macro call as the maximum is
|
||||
// (2^64-1)^2 + 2^64-1 = 2^128 - 2^64
|
||||
MACRO_START(SQR_SINGLEADD_64, index, src_reg, dst_reg, mul_word, src_carry, dst_carry, scratch0, scratch1)
|
||||
|
||||
ldr scratch0, [src_reg, #8*index] // pSrc[i+j]
|
||||
|
||||
mul scratch1, mul_word, scratch0 // Bits <63:0> of pSrc[i]*pSrc[i+j]
|
||||
adds scratch1, scratch1, src_carry // Adding the previous word
|
||||
umulh dst_carry, mul_word, scratch0 // Bits <127:64> of pSrc[i]*pSrc[i+j]
|
||||
adc dst_carry, dst_carry, xzr // Add the intermediate carry and don't update the flags
|
||||
|
||||
str scratch1, [dst_reg, #8*index] // Store to destination
|
||||
|
||||
MACRO_END()
|
||||
|
||||
// Macro for the remaining loops of the first pass of RawSquareAsm.
|
||||
// The only difference to the above is that it also adds the word loaded
|
||||
// from the destination buffer.
|
||||
//
|
||||
// No carry flag is propagated from the previous macro call as the maximum is
|
||||
// (2^64-1)^2 + 2(2^64-1) = 2^128 - 1
|
||||
MACRO_START(SQR_DOUBLEADD_64, index, src_reg, dst_reg, mul_word, src_carry, dst_carry, scratch0, scratch1, scratch2)
|
||||
|
||||
ldr scratch0, [src_reg, #8*index] // pSrc[i+j]
|
||||
ldr scratch2, [dst_reg, #8*index] // pDst[2*(i+j)]
|
||||
|
||||
mul scratch1, mul_word, scratch0 // Bits <63:0> of pSrc[i]*pSrc[i+j]
|
||||
adds scratch1, scratch1, src_carry // Adding the previous word
|
||||
umulh dst_carry, mul_word, scratch0 // Bits <127:64> of pSrc[i]*pSrc[i+j]
|
||||
adc dst_carry, dst_carry, xzr // Add the intermediate carry and don't update the flags
|
||||
|
||||
adds scratch1, scratch1, scratch2 // Add the word from the destination
|
||||
adc dst_carry, dst_carry, xzr // Add the intermediate carry and don't update the flags
|
||||
|
||||
str scratch1, [dst_reg, #8*index] // Store to destination
|
||||
|
||||
MACRO_END()
|
||||
|
||||
// Macro for the third pass loop of RawSquareAsm.
|
||||
// It takes one mulword from the source, squares it, and
|
||||
// adds it to the even columns of the destination. The carries are propagated
|
||||
// to the odd columns.
|
||||
//
|
||||
// Here we can have a (1-bit) carry to the next call because the maximum value for
|
||||
// a pair of columns is (2^64-1)^2+(2^128-1)+1 = 2^129 - 2^65 + 1 < 2^129 - 1
|
||||
MACRO_START(SQR_DIAGONAL_PROP, index, src_reg, dst_reg, squarelo, squarehi, scratch0, scratch1)
|
||||
|
||||
ldr squarehi, [src_reg, #8*index] // mulword
|
||||
mul squarelo, squarehi, squarehi // Bits <63:0> of m^2
|
||||
umulh squarehi, squarehi, squarehi // Bits <127:64> of m^2
|
||||
|
||||
ldp scratch0, scratch1, [dst_reg, #16*index] // Load
|
||||
|
||||
// Adding the square to the even column
|
||||
adcs squarelo, squarelo, scratch0 // carry from previous and update the flags
|
||||
|
||||
// Propagating the sum to the next column
|
||||
adcs squarehi, squarehi, scratch1 // This can generate a carry
|
||||
|
||||
stp squarelo, squarehi, [dst_reg, #16*index] // Store
|
||||
|
||||
MACRO_END()
|
||||
|
||||
//VOID
|
||||
//SYMCRYPT_CALL
|
||||
//SymCryptFdefRawSquareAsm(
|
||||
// _In_reads_(nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc,
|
||||
// UINT32 nDigits,
|
||||
// _Out_writes_(2*nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PUINT32 pDst )
|
||||
//
|
||||
// Register assignments
|
||||
// X_0 = pSrc
|
||||
// X_1 = word count of pSrc
|
||||
// X_2 = pSrc (moving forward one digit / 4 words every inner loop)
|
||||
// X_3 = digit count of pSrc
|
||||
// X_4 = pDst (moving forward one digit every inner loop)
|
||||
// X_5 = pDst (moving forward one word every outer loop)
|
||||
// X_6 = Current word loaded from pSrc
|
||||
// X_7, X_8 = Current words loaded in pairs from pSrc2
|
||||
// X_9, X_10 = Current words loaded in pairs from pDst
|
||||
// X_11, X_12 = "128-bit" sliding register to hold the result of multiplies
|
||||
// X_13 = Stored pSrc
|
||||
// X_14 = Digit count of pSrc
|
||||
// X_15 = Stored digit count of pSrc
|
||||
// X_16 = Stored pDst
|
||||
|
||||
FUNCTION_START(ARM64EC_NAME_MANGLE(SymCryptFdefRawSquareAsm), 3, 17)
|
||||
|
||||
mov X_3, X_1 // digit count into X_3
|
||||
|
||||
lsl X_1, X_1, #2 // Calculate word count
|
||||
|
||||
mov X_4, X_2 // pDst
|
||||
mov X_5, X_2 // store pDst
|
||||
mov X_16, X_2 // store pDst
|
||||
mov X_13, X_0 // store pSrc
|
||||
mov X_2, X_0 // inner loop pSrc
|
||||
mov X_14, X_3 // store nDigits for later
|
||||
mov X_15, X_3 // store nDigits for later
|
||||
|
||||
//
|
||||
// First iteration of main loop (no adding of previous values from pDst)
|
||||
//
|
||||
ands X_12, X_12, xzr // Clearing the carry flag and setting X_12 = 0
|
||||
ldr X_6, [X_0] // load the first word from pSrc1
|
||||
str X_12, [X_4] // store 0 for the first word
|
||||
|
||||
b SymCryptFdefRawSquareAsmInnerLoopInit_Word1
|
||||
|
||||
LABEL(SymCryptFdefRawSquareAsmInnerLoopInit_Word0)
|
||||
SQR_SINGLEADD_64 0, X_2, X_4, X_6, X_12, X_12, X_7, X_8
|
||||
|
||||
LABEL(SymCryptFdefRawSquareAsmInnerLoopInit_Word1)
|
||||
SQR_SINGLEADD_64 1, X_2, X_4, X_6, X_12, X_12, X_7, X_8
|
||||
|
||||
SQR_SINGLEADD_64 2, X_2, X_4, X_6, X_12, X_12, X_7, X_8
|
||||
|
||||
SQR_SINGLEADD_64 3, X_2, X_4, X_6, X_12, X_12, X_7, X_8
|
||||
|
||||
sub X_3, X_3, #1 // move one digit up
|
||||
add X_2, X_2, #32
|
||||
add X_4, X_4, #32
|
||||
|
||||
cbnz X_3, SymCryptFdefRawSquareAsmInnerLoopInit_Word0
|
||||
|
||||
str X_12, [X_4] // Store the next word into the destination
|
||||
|
||||
sub X_1, X_1, #2 // move two words up (we started at the word 1)
|
||||
|
||||
mov X_8, #1 // Cyclic counter
|
||||
|
||||
//
|
||||
// MAIN LOOP
|
||||
//
|
||||
LABEL(SymCryptFdefRawSquareAsmOuterLoop)
|
||||
|
||||
add X_5, X_5, #8 // move start of pDst one word up
|
||||
|
||||
mov X_3, X_14 // set nDigits
|
||||
mov X_2, X_0 // set pSrc
|
||||
mov X_4, X_5 // set pDst
|
||||
|
||||
ands X_12, X_12, xzr // Clearing the carry flag and setting X_12 = 0
|
||||
ldr X_6, [X_0, X_8, LSL #3] // load the next word from pSrc
|
||||
|
||||
// Cyclic counter and jump logic
|
||||
add X_8, X_8, #1
|
||||
cmp X_8, #1
|
||||
beq SymCryptFdefRawSquareAsmInnerLoop_Word1
|
||||
cmp X_8, #2
|
||||
beq SymCryptFdefRawSquareAsmInnerLoop_Word2
|
||||
cmp X_8, #3
|
||||
beq SymCryptFdefRawSquareAsmInnerLoop_Word3
|
||||
|
||||
// The following instructions are only executed when X_8 == 4
|
||||
mov X_8, xzr // Set it to 0
|
||||
|
||||
add X_0, X_0, #32 // move start of pSrc 4 words up
|
||||
add X_5, X_5, #32 // move pDst 4 words up
|
||||
|
||||
mov X_2, X_0 // set pSrc
|
||||
mov X_4, X_5 // set pDst
|
||||
|
||||
sub X_14, X_14, #1 // remove 1 digit
|
||||
mov X_3, X_14 // set the new digit counter
|
||||
|
||||
LABEL(SymCryptFdefRawSquareAsmInnerLoop_Word0)
|
||||
SQR_DOUBLEADD_64 0, X_2, X_4, X_6, X_12, X_12, X_7, X_9, X_10
|
||||
|
||||
LABEL(SymCryptFdefRawSquareAsmInnerLoop_Word1)
|
||||
SQR_DOUBLEADD_64 1, X_2, X_4, X_6, X_12, X_12, X_7, X_9, X_10
|
||||
|
||||
LABEL(SymCryptFdefRawSquareAsmInnerLoop_Word2)
|
||||
SQR_DOUBLEADD_64 2, X_2, X_4, X_6, X_12, X_12, X_7, X_9, X_10
|
||||
|
||||
LABEL(SymCryptFdefRawSquareAsmInnerLoop_Word3)
|
||||
SQR_DOUBLEADD_64 3, X_2, X_4, X_6, X_12, X_12, X_7, X_9, X_10
|
||||
|
||||
sub X_3, X_3, #1 // move one digit up
|
||||
add X_2, X_2, #32
|
||||
add X_4, X_4, #32
|
||||
|
||||
cbnz X_3, SymCryptFdefRawSquareAsmInnerLoop_Word0
|
||||
|
||||
str X_12, [X_4] // Store the next word into the destination
|
||||
|
||||
sub X_1, X_1, #1 // move one word up
|
||||
cbnz X_1, SymCryptFdefRawSquareAsmOuterLoop
|
||||
|
||||
ands X_12, X_12, xzr // Setting X_12 = 0
|
||||
str X_12, [X_5, #40] // Store 0 to destination for the top word
|
||||
|
||||
////////////////////////////////////////////////////////////////
|
||||
// Second Pass - Shifting all results 1 bit left
|
||||
////////////////////////////////////////////////////////////////
|
||||
|
||||
mov X_3, X_15 // nDigits
|
||||
lsl X_3, X_3, #1 // Double digits
|
||||
mov X_4, X_16 // pDst pointer
|
||||
ands X_7, X_7, xzr // Clear the flags
|
||||
|
||||
LABEL(SymCryptFdefRawSquareAsmSecondPass)
|
||||
|
||||
sub X_3, X_3, #1 // move one digit up
|
||||
|
||||
ldp X_7, X_8, [X_4]
|
||||
adcs X_7, X_7, X_7 // Shift left and add the carry
|
||||
adcs X_8, X_8, X_8
|
||||
stp X_7, X_8, [X_4], #16
|
||||
|
||||
ldp X_9, X_10, [X_4]
|
||||
adcs X_9, X_9, X_9 // Shift left and add the carry
|
||||
adcs X_10, X_10, X_10
|
||||
stp X_9, X_10, [X_4], #16
|
||||
|
||||
cbnz X_3, SymCryptFdefRawSquareAsmSecondPass
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
// Third Pass - Adding the squares on the even columns and propagating the sum
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
ands X_7, X_7, xzr // Clear the flags
|
||||
mov X_0, X_13 // src pointer
|
||||
mov X_4, X_16 // pDst pointer
|
||||
mov X_3, X_15 // nDigits
|
||||
|
||||
LABEL(SymCryptFdefRawSquareAsmThirdPass)
|
||||
SQR_DIAGONAL_PROP 0, X_0, X_4, X_6, X_7, X_8, X_9
|
||||
SQR_DIAGONAL_PROP 1, X_0, X_4, X_6, X_7, X_8, X_9
|
||||
SQR_DIAGONAL_PROP 2, X_0, X_4, X_6, X_7, X_8, X_9
|
||||
SQR_DIAGONAL_PROP 3, X_0, X_4, X_6, X_7, X_8, X_9
|
||||
|
||||
sub X_3, X_3, #1 // move one digit up
|
||||
add X_0, X_0, #32 // One digit up (not updated in SQR_DIAGONAL_PROP)
|
||||
add X_4, X_4, #64 // Two digits up (not updated in SQR_DIAGONAL_PROP)
|
||||
|
||||
cbnz X_3, SymCryptFdefRawSquareAsmThirdPass
|
||||
|
||||
// Done, no return value
|
||||
|
||||
FUNCTION_END(ARM64EC_NAME_MANGLE(SymCryptFdefRawSquareAsm))
|
||||
|
||||
//VOID
|
||||
//SYMCRYPT_CALL
|
||||
//SymCryptFdefMontgomeryReduceAsm(
|
||||
// _In_ PCSYMCRYPT_MODULUS pmMod,
|
||||
// _Inout_ PUINT32 pSrc,
|
||||
// _Out_ PUINT32 pDst )
|
||||
//
|
||||
// Register assignments
|
||||
// X_0 = pMod (moving forward one *digit* every inner loop)
|
||||
// X_1 = pSrc (moving forward one *digit* every inner loop)
|
||||
// X_2 = pDst (used only in the end for subtract / result)
|
||||
// X_3 = digit count of pSrc and pMod
|
||||
// X_4 = word count of pSrc
|
||||
// X_5 = Inv64 of the modulus
|
||||
// X_6 = m = pSrc[i]*Inv64
|
||||
// X_7 = hc = high carry variable
|
||||
// X_8, X_9 = Current words loaded in pairs from pSrc
|
||||
// X_10, X_11 = Current words loaded in pairs from pMod
|
||||
// X_12, X_13 = c variable = "128-bit" register to hold the result of multiplies
|
||||
// It is flipped between [X_12:X_13] and [X_13:X_12] instead of doing c>>=64
|
||||
// X_14 = Temporary intermediate result
|
||||
// X_15 = Stored digit count of pSrc
|
||||
// X_16 = Stored pMod pointer
|
||||
// X_17 = Stored pSrc pointer (moving forward one word every outer loop)
|
||||
|
||||
FUNCTION_START(ARM64EC_NAME_MANGLE(SymCryptFdefMontgomeryReduceAsm), 3, 18)
|
||||
|
||||
ldr W_3, [X_0, #SymCryptModulusNdigitsOffsetArm64] // # of Digits
|
||||
ldr X_5, [X_0, #SymCryptModulusMontgomeryInv64OffsetArm64] // Inv64 of modulus
|
||||
add X_0, X_0, #SymCryptModulusValueOffsetArm64 // pMod
|
||||
|
||||
lsl X_4, X_3, #2 // Multiply by 4 to get the number of words
|
||||
|
||||
sub X_0, X_0, #32 // offset pMod so we can use pre-increment form of loads
|
||||
sub X_1, X_1, #32 // offset pSrc so we can use pre-increment form of loads
|
||||
sub X_2, X_2, #32 // offset pDst so we can use pre-increment form of loads
|
||||
|
||||
mov X_15, X_3 // Store the digit count for later
|
||||
mov X_16, X_0 // Store the pMod pointer
|
||||
mov X_17, X_1 // Store the pSrc pointer
|
||||
|
||||
and X_7, X_7, xzr // Set hc to 0
|
||||
|
||||
//
|
||||
// Main loop
|
||||
//
|
||||
LABEL(SymCryptFdefMontgomeryReduceAsmOuter)
|
||||
ldr X_8, [X_1, #32] // Load 1 word from pSrc
|
||||
mul X_6, X_8, X_5 // <63:0> bits of pSrc[i]*Inv64 = m
|
||||
|
||||
and X_12, X_12, xzr // Set c to 0
|
||||
|
||||
LABEL(SymCryptFdefMontgomeryReduceAsmInner)
|
||||
ldp X_10, X_11, [X_0, #32]! // pMod[j]
|
||||
ldp X_8, X_9, [X_1, #32]! // pSrc[j]
|
||||
|
||||
mul X_14, X_6, X_10 // <63:0> of pMod[j]*m
|
||||
adds X_14, X_14, X_8 // Adding pSrc[j]
|
||||
umulh X_13, X_6, X_10 // <127:64> of pMod[j]*m
|
||||
adc X_13, X_13, xzr // Add the carry if any (***)
|
||||
adds X_12, X_12, X_14 // Add the lower bits of c
|
||||
adc X_13, X_13, xzr // Add the carry if any (***)
|
||||
// ***: These cannot produce extra carry as the maximum is
|
||||
// (2^64 - 1)*(2^64-1) + 2^64-1 + 2^64-1 = 2^128 - 1
|
||||
str X_12, [X_1] // pSrc[j] = (UINT64) c
|
||||
|
||||
mul X_14, X_6, X_11 // <63:0> of pMod[j]*m
|
||||
adds X_14, X_14, X_9 // Adding pSrc[j]
|
||||
umulh X_12, X_6, X_11 // <127:64> of pMod[j]*m
|
||||
adc X_12, X_12, xzr // Add the carry if any (***)
|
||||
adds X_13, X_13, X_14 // Add the lower bits of c
|
||||
adc X_12, X_12, xzr // Add the carry if any (***)
|
||||
str X_13, [X_1, #8] // pSrc[j] = (UINT64) c
|
||||
|
||||
ldp X_10, X_11, [X_0, #16] // pMod[j]
|
||||
ldp X_8, X_9, [X_1, #16] // pSrc[j]
|
||||
|
||||
mul X_14, X_6, X_10 // <63:0> of pMod[j]*m
|
||||
adds X_14, X_14, X_8 // Adding pSrc[j]
|
||||
umulh X_13, X_6, X_10 // <127:64> of pMod[j]*m
|
||||
adc X_13, X_13, xzr // Add the carry if any (***)
|
||||
adds X_12, X_12, X_14 // Add the lower bits of c
|
||||
adc X_13, X_13, xzr // Add the carry if any (***)
|
||||
str X_12, [X_1, #16] // pSrc[j] = (UINT64) c
|
||||
|
||||
mul X_14, X_6, X_11 // <63:0> of pMod[j]*m
|
||||
adds X_14, X_14, X_9 // Adding pSrc[j]
|
||||
umulh X_12, X_6, X_11 // <127:64> of pMod[j]*m
|
||||
adc X_12, X_12, xzr // Add the carry if any (***)
|
||||
adds X_13, X_13, X_14 // Add the lower bits of c
|
||||
adc X_12, X_12, xzr // Add the carry if any (***)
|
||||
str X_13, [X_1, #24] // pSrc[j] = (UINT64) c
|
||||
|
||||
subs X_3, X_3, #1 // Move one digit up
|
||||
bne SymCryptFdefMontgomeryReduceAsmInner
|
||||
|
||||
ldr X_8, [X_1, #32] // pSrc[nWords]
|
||||
adds X_12, X_12, X_8 // c + pSrc[nWords]
|
||||
adc X_13, xzr, xzr // Add the carry if any
|
||||
|
||||
adds X_12, X_12, X_7 // c + pSrc[nWords] + hc
|
||||
adc X_7, X_13, xzr // Add the carry if any and store into hc
|
||||
|
||||
str X_12, [X_1, #32] // pSrc[nWords] = c
|
||||
|
||||
subs X_4, X_4, #1 // Move one word up
|
||||
|
||||
add X_17, X_17, #8 // Move stored pSrc pointer one word up
|
||||
mov X_0, X_16 // Restore pMod pointer
|
||||
mov X_1, X_17 // Restore pSrc pointer
|
||||
|
||||
mov X_3, X_15 // Restore the digit counter
|
||||
|
||||
bne SymCryptFdefMontgomeryReduceAsmOuter
|
||||
|
||||
//
|
||||
// Subtraction
|
||||
//
|
||||
|
||||
mov X_14, X_2 // Store pDst pointer
|
||||
|
||||
// Prepare the pointers for subtract
|
||||
mov X_0, X_17 // pSrc
|
||||
mov X_1, X_16 // pMod
|
||||
|
||||
mov X_10, X_7 // X_10 = hc
|
||||
mov X_3, X_15 // Restore the digit counter
|
||||
subs X_4, X_4, X_4 // Set the carry flag (i.e. no borrow)
|
||||
|
||||
LABEL(SymCryptFdefMontgomeryReduceRawSubAsmLoop)
|
||||
sub X_3, X_3, #1 // Decrement the digit count by one
|
||||
// borrow is in the carry flag (flipped)
|
||||
|
||||
ldp X_4, X_6, [X_0, #32]! // Load two words of pSrc1
|
||||
ldp X_5, X_7, [X_1, #32]! // Load two words of pSrc2
|
||||
sbcs X_4, X_4, X_5
|
||||
sbcs X_6, X_6, X_7
|
||||
stp X_4, X_6, [X_2, #32]! // Store the result in the destination
|
||||
|
||||
ldp X_4, X_6, [X_0, #16] // Load two words of pSrc1
|
||||
ldp X_5, X_7, [X_1, #16] // Load two words of pSrc2
|
||||
sbcs X_4, X_4, X_5
|
||||
sbcs X_6, X_6, X_7
|
||||
stp X_4, X_6, [X_2, #16] // Store the result in the destination
|
||||
|
||||
cbnz X_3, SymCryptFdefMontgomeryReduceRawSubAsmLoop
|
||||
|
||||
cset X_0, cc // If the carry is clear (borrow), set the return value to 1
|
||||
|
||||
orr X_11, X_10, X_0 // X_11 = hc|d
|
||||
|
||||
// Prepare the pointers for masked copy
|
||||
mov X_0, X_17 // pSrc
|
||||
mov X_1, X_14 // pDst
|
||||
|
||||
mov X_2, X_15 // Restore the digit counter
|
||||
subs X_4, X_10, X_11 // If (X_11 > X_10) clear the carry flag (i.e. borrow)
|
||||
|
||||
LABEL(SymCryptFdefMontgomeryReduceMaskedCopyAsmLoop)
|
||||
sub X_2, X_2, #1 // decrement the digit count by one
|
||||
|
||||
ldp X_4, X_6, [X_0, #32]! // Load two words of the source
|
||||
ldp X_5, X_7, [X_1, #32]! // Load two words of the destination
|
||||
csel X_4, X_4, X_5, cc // If the carry is clear, select the source operands
|
||||
csel X_6, X_6, X_7, cc
|
||||
stp X_4, X_6, [X_1] // Store the two words in the destination
|
||||
|
||||
ldp X_4, X_6, [X_0, #16]
|
||||
ldp X_5, X_7, [X_1, #16]
|
||||
csel X_4, X_4, X_5, cc
|
||||
csel X_6, X_6, X_7, cc
|
||||
stp X_4, X_6, [X_1, #16]
|
||||
|
||||
cbnz X_2, SymCryptFdefMontgomeryReduceMaskedCopyAsmLoop
|
||||
|
||||
// Done, no return value
|
||||
|
||||
FUNCTION_END(ARM64EC_NAME_MANGLE(SymCryptFdefMontgomeryReduceAsm))
|
||||
|
||||
FILE_END()
|
|
@ -1,28 +0,0 @@
|
|||
;
|
||||
; SymCrypt_magic.inc
|
||||
; Copyright (c) Microsoft Corporation. Licensed under the MIT license.
|
||||
;
|
||||
; Include file to define the support macros for the Magic field
|
||||
;
|
||||
|
||||
IMPORT ARM64EC_NAME_MANGLE(SymCryptFatal)
|
||||
|
||||
#define SYMCRYPT_CODE_VERSION (SYMCRYPT_CODE_VERSION_API * 65536 + SYMCRYPT_CODE_VERSION_MINOR)
|
||||
#define SYMCRYPT_MAGIC_CONSTANT (0x53316d76 + SYMCRYPT_CODE_VERSION)
|
||||
|
||||
MACRO
|
||||
SYMCRYPT_CHECK_MAGIC $temp1, $temp2, $ptr, $offset
|
||||
|
||||
#if SYMCRYPT_DEBUG
|
||||
|
||||
ldr $temp1, [$ptr, #$offset]
|
||||
subs $temp1, $temp1, $ptr
|
||||
mov32 $temp2, SYMCRYPT_MAGIC_CONSTANT
|
||||
cmp $temp1, $temp2
|
||||
beq %F1
|
||||
mov32 r0, 0x6d616763 ; 'magc'
|
||||
bl ARM64EC_NAME_MANGLE(SymCryptFatal)
|
||||
1
|
||||
#endif
|
||||
|
||||
MEND
|
|
@ -1,37 +0,0 @@
|
|||
TTL "SymCryptWipe"
|
||||
;++
|
||||
;
|
||||
; Copyright (c) Microsoft Corporation. Licensed under the MIT license.
|
||||
;
|
||||
; Secure wipe
|
||||
;
|
||||
;--
|
||||
|
||||
#include "ksarm64.h"
|
||||
#include "symcrypt_name_mangling.inc"
|
||||
|
||||
TEXTAREA
|
||||
|
||||
EXTERN ARM64EC_NAME_MANGLE(memset)
|
||||
|
||||
SUBT "SymCryptWipe"
|
||||
;VOID
|
||||
;SYMCRYPT_CALL
|
||||
;SymCryptWipe( _Out_writes_bytes_( cbData ) PVOID pbData,
|
||||
; SIZE_T cbData )
|
||||
|
||||
|
||||
LEAF_ENTRY ARM64EC_NAME_MANGLE(SymCryptWipeAsm)
|
||||
|
||||
; we just jump to memset.
|
||||
; this is enough to stop the compiler optimizing the memset away.
|
||||
|
||||
mov x2, x1
|
||||
mov x1, #0
|
||||
b ARM64EC_NAME_MANGLE(memset)
|
||||
|
||||
LEAF_END ARM64EC_NAME_MANGLE(SymCryptWipeAsm)
|
||||
|
||||
|
||||
|
||||
END
|
|
@ -0,0 +1,31 @@
|
|||
//
|
||||
// wipe.symcryptasm Assembler code for wiping a buffer
|
||||
// Expresses asm in a generic enough way to enable generation of MASM and GAS using the
|
||||
// symcryptasm_processor.py script and C preprocessor
|
||||
//
|
||||
// Copyright (c) Microsoft Corporation. Licensed under the MIT license.
|
||||
|
||||
|
||||
#include "symcryptasm_shared.cppasm"
|
||||
|
||||
TEXTAREA()
|
||||
|
||||
EXTERN(ARM64EC_NAME_MANGLE(memset))
|
||||
|
||||
//VOID
|
||||
//SYMCRYPT_CALL
|
||||
//SymCryptWipe( _Out_writes_bytes_( cbData ) PVOID pbData,
|
||||
// SIZE_T cbData )
|
||||
|
||||
FUNCTION_START(ARM64EC_NAME_MANGLE(SymCryptWipeAsm), 2, 3)
|
||||
|
||||
// we just jump to memset.
|
||||
// this is enough to stop the compiler optimizing the memset away.
|
||||
|
||||
mov X_2, X_1
|
||||
mov X_1, #0
|
||||
b ARM64EC_NAME_MANGLE(memset)
|
||||
|
||||
FUNCTION_END(ARM64EC_NAME_MANGLE(SymCryptWipeAsm))
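//
// Background for the comment above, in illustrative C (not SymCrypt code): in a function such as
//
//   void f( void ) { unsigned char key[32]; use( key ); memset( key, 0, sizeof( key ) ); }
//
// the compiler may delete the trailing memset as a dead store. Because SymCryptWipeAsm is an
// opaque assembly routine, the call (and the memset it tail-calls) cannot be optimized away.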
|
||||
|
||||
FILE_END()
|
|
@ -1 +0,0 @@
|
|||
#include "..\arm64\fdef369_asm.asm"
|
|
@ -1 +0,0 @@
|
|||
#include "..\arm64\fdef_asm.asm"
|
|
@ -1 +0,0 @@
|
|||
#include "..\arm64\symcrypt_magic.inc"
|
|
@ -1 +0,0 @@
|
|||
#include "..\arm64\symcrypt_name_mangling.inc"
|
|
@ -1 +0,0 @@
|
|||
#include "..\arm64\wipe.asm"
|
|
@ -1,132 +0,0 @@
|
|||
//
|
||||
// asmstubs.c
|
||||
// Temporary forwarders for ASM implementations which we don't yet support with GCC/LLVM on Arm64
|
||||
//
|
||||
// Copyright (c) Microsoft Corporation. Licensed under the MIT license.
|
||||
//
|
||||
|
||||
#include "../precomp.h"
|
||||
|
||||
VOID
|
||||
SYMCRYPT_CALL
|
||||
SymCryptWipeAsm( _Out_writes_bytes_( cbData ) PVOID pbData, SIZE_T cbData )
|
||||
{
|
||||
volatile BYTE * p = (volatile BYTE *) pbData;
|
||||
SIZE_T i;
|
||||
|
||||
for( i=0; i<cbData; i++ ){
|
||||
p[i] = 0;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
VOID
|
||||
SYMCRYPT_CALL
|
||||
SymCryptFdefMaskedCopyC(
|
||||
_In_reads_bytes_( nDigits*SYMCRYPT_FDEF_DIGIT_SIZE ) PCBYTE pbSrc,
|
||||
_Inout_updates_bytes_( nDigits*SYMCRYPT_FDEF_DIGIT_SIZE ) PBYTE pbDst,
|
||||
UINT32 nDigits,
|
||||
UINT32 mask );
|
||||
|
||||
VOID
|
||||
SYMCRYPT_CALL
|
||||
SymCryptFdefMaskedCopyAsm(
|
||||
_In_reads_bytes_( nDigits*SYMCRYPT_FDEF_DIGIT_SIZE ) PCBYTE pbSrc,
|
||||
_Inout_updates_bytes_( nDigits*SYMCRYPT_FDEF_DIGIT_SIZE ) PBYTE pbDst,
|
||||
UINT32 nDigits,
|
||||
UINT32 mask )
|
||||
{
|
||||
SymCryptFdefMaskedCopyC( pbSrc, pbDst, nDigits, mask );
|
||||
}
|
||||
|
||||
UINT32
|
||||
SYMCRYPT_CALL
|
||||
SymCryptFdefRawAddC(
|
||||
_In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc1,
|
||||
_In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc2,
|
||||
_Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 pDst,
|
||||
UINT32 nDigits );
|
||||
|
||||
UINT32
|
||||
SYMCRYPT_CALL
|
||||
SymCryptFdefRawAddAsm(
|
||||
_In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc1,
|
||||
_In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc2,
|
||||
_Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 pDst,
|
||||
UINT32 nDigits )
|
||||
{
|
||||
return SymCryptFdefRawAddC( pSrc1, pSrc2, pDst, nDigits );
|
||||
}
|
||||
|
||||
UINT32
|
||||
SYMCRYPT_CALL
|
||||
SymCryptFdefRawSubC(
|
||||
_In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc1,
|
||||
_In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc2,
|
||||
_Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 pDst,
|
||||
UINT32 nDigits );
|
||||
|
||||
UINT32
|
||||
SYMCRYPT_CALL
|
||||
SymCryptFdefRawSubAsm(
|
||||
_In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc1,
|
||||
_In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc2,
|
||||
_Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 pDst,
|
||||
UINT32 nDigits )
|
||||
{
|
||||
return SymCryptFdefRawSubC( pSrc1, pSrc2, pDst, nDigits );
|
||||
}
|
||||
|
||||
VOID
|
||||
SYMCRYPT_CALL
|
||||
SymCryptFdefRawMulC(
|
||||
_In_reads_(nWords1) PCUINT32 pSrc1,
|
||||
UINT32 nDigits1,
|
||||
_In_reads_(nWords2) PCUINT32 pSrc2,
|
||||
UINT32 nDigits2,
|
||||
_Out_writes_(nWords1 + nWords2) PUINT32 pDst );
|
||||
|
||||
VOID
|
||||
SYMCRYPT_CALL
|
||||
SymCryptFdefRawMulAsm(
|
||||
_In_reads_(nDigits1*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc1,
|
||||
UINT32 nDigits1,
|
||||
_In_reads_(nDigits2*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc2,
|
||||
UINT32 nDigits2,
|
||||
_Out_writes_(nWords1 + nWords2) PUINT32 pDst )
|
||||
{
|
||||
SymCryptFdefRawMulC( pSrc1, nDigits1, pSrc2, nDigits2, pDst );
|
||||
}
|
||||
|
||||
VOID
|
||||
SYMCRYPT_CALL
|
||||
SymCryptFdefRawSquareC(
|
||||
_In_reads_(nWords) PCUINT32 pSrc,
|
||||
UINT32 nDigits,
|
||||
_Out_writes_(2*nWords) PUINT32 pDst );
|
||||
|
||||
VOID
|
||||
SYMCRYPT_CALL
|
||||
SymCryptFdefRawSquareAsm(
|
||||
_In_reads_(nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc,
|
||||
UINT32 nDigits,
|
||||
_Out_writes_(2*nWords) PUINT32 pDst )
|
||||
{
|
||||
SymCryptFdefRawSquareC( pSrc, nDigits, pDst );
|
||||
}
|
||||
|
||||
VOID
|
||||
SymCryptFdefMontgomeryReduceC(
|
||||
_In_ PCSYMCRYPT_MODULUS pmMod,
|
||||
_In_ PUINT32 pSrc,
|
||||
_Out_ PUINT32 pDst );
|
||||
|
||||
VOID
|
||||
SYMCRYPT_CALL
|
||||
SymCryptFdefMontgomeryReduceAsm(
|
||||
_In_ PCSYMCRYPT_MODULUS pmMod,
|
||||
_In_ PUINT32 pSrc,
|
||||
_Out_ PUINT32 pDst )
|
||||
{
|
||||
SymCryptFdefMontgomeryReduceC( pmMod, pSrc, pDst );
|
||||
}
|
|
@ -4,9 +4,18 @@

# Preprocess amd64 .symcryptasm into masm
{amd64\}.symcryptasm{$(OBJ_PATH)\$(O)\..\amd64\}.asm:
    ..\scripts\symcryptasm_processor.py masm $< $(OBJ_PATH)\$(O)\$(<B).cppasm
    ..\scripts\symcryptasm_processor.py masm amd64 msft $< $(OBJ_PATH)\$(O)\$(<B).cppasm
    $(CC) $(CFLAGS) /EP /P /I..\inc\ /I.\ /DSYMCRYPT_CPU_AMD64 /DSYMCRYPT_MASM /Fi$@ $(OBJ_PATH)\$(O)\$(<B).cppasm

# Preprocess x86 .cppasm into masm
{i386\}.cppasm{$(OBJ_PATH)\$(O)\..\i386\}.asm:
    $(CC) $(CFLAGS) /EP /P /I..\inc\ /I.\ /DSYMCRYPT_CPU_X86 /DSYMCRYPT_MASM /Fi$@ $<

# Preprocess arm64 .symcryptasm into masm
{arm64\}.symcryptasm{$(OBJ_PATH)\$(O)\..\arm64\}.asm:
    ..\scripts\symcryptasm_processor.py armasm64 arm64 aapcs64 $< $(OBJ_PATH)\$(O)\$(<B).asm

# Preprocess arm64ec .symcryptasm into masm
{arm64\}.symcryptasm{$(OBJ_PATH)\$(O)\..\arm64\arm64ec\}.asm:
    ..\scripts\symcryptasm_processor.py armasm64 arm64 arm64ec $< $(OBJ_PATH)\$(O)\arm64ec\$(<B).asm

lib/sources
@ -18,8 +18,13 @@ GUARD = 1 # enable CFG

ENABLE_ASM_RETPOLINE = 1
ENABLE_RETPOLINE_LINKER_WARNING = 1

# Enable /Gy for all assembler code
ASM_DEFINES=$(ASM_DEFINES) /Gy
# Enable /Gy for all assembler code, and some additional symcryptasm definitions for Arm64 assembler code
ASM_DEFINES=\
!IF "$(_BUILDARCH)" == "arm64"
    $(ASM_DEFINES) /Gy /DSYMCRYPT_CPU_ARM64 /DSYMCRYPT_MASM
!ELSE
    $(ASM_DEFINES) /Gy
!ENDIF

USE_MAKEFILE_INC = 1

@ -35,6 +40,15 @@ NTTARGETFILE0=\

!ELSEIF "$(_BUILDARCH)" == "x86"
    $(OBJ_PATH)\$(O)\..\i386\aesasm.asm \
    $(OBJ_PATH)\$(O)\..\i386\fdef_asm.asm \
!ELSEIF "$(_BUILDARCH)" == "arm64"
    $(OBJ_PATH)\$(O)\..\arm64\fdef_asm.asm \
    $(OBJ_PATH)\$(O)\..\arm64\fdef369_asm.asm \
    $(OBJ_PATH)\$(O)\..\arm64\wipe.asm \
!IF "$(ARM64X_EC_ENABLED)" == "1"
    $(OBJ_PATH)\$(O)\..\arm64\arm64ec\fdef_asm.asm \
    $(OBJ_PATH)\$(O)\..\arm64\arm64ec\fdef369_asm.asm \
    $(OBJ_PATH)\$(O)\..\arm64\arm64ec\wipe.asm \
!ENDIF
!ENDIF

INCLUDES= \
@ -9,6 +9,10 @@

#if defined(SYMCRYPT_CPU_AMD64)
include ksamd64.inc
#elif defined(SYMCRYPT_CPU_ARM64)
#include "ksarm64.h"
#include "arm64/symcrypt_name_mangling.inc"
#include "symcrypt_version.inc"
#endif

#include "C_asm_shared.inc"

@ -17,17 +21,29 @@ include ksamd64.inc

#define ALIGN(__alignment) align __alignment
#define GET_SYMBOL_ADDRESS(__symbol) __symbol
#define HEX(__constant) __constant##h
#define TEXTAREA() TEXTAREA
#define EXTERN(__label) EXTERN __label
#define LABEL(__labelname) __labelname

#elif defined(SYMCRYPT_GAS)

#if defined(SYMCRYPT_CPU_AMD64)
.intel_syntax noprefix
#elif defined(SYMCRYPT_CPU_ARM64)
#include "arm64/symcrypt_name_mangling.inc"
#include "symcrypt_version.inc"
#endif

#include "C_asm_shared.inc"

#define FILE_END()
#define ALIGN(__alignment) .align __alignment
#define GET_SYMBOL_ADDRESS(__symbol) __symbol+rip
#define HEX(__constant) 0x##__constant
#define TEXTAREA()
#define EXTERN(__label)
#define LABEL(__labelname) __labelname:

#else
@ -5,20 +5,26 @@ environments without requiring forking or duplication of source files - symcrypt

assembly in an assembler and environment agnostic way.

The current target assemblers are:
    MASM and GAS
    MASM, GAS, and armasm64 (Arm64 assembler which ships with MSVC)
The current target environments are:
    amd64 Windows (using the Microsoft x64 calling convention), and
    amd64 Linux (using the SystemV amd64 calling convention)
    amd64 Windows (using the Microsoft x64 calling convention),
    amd64 Linux (using the SystemV amd64 calling convention),
    arm64 Windows (using the aapcs64 calling convention),
    arm64 Windows (using the arm64ec calling convention), and
    arm64 Linux (using the aapcs64 calling convention)

Currently we only support functions with up to 6 arguments, and only amd64, but the plan is to
rephrase all remaining .asm in SymCrypt as symcryptasm, extending support as appropriate to enable
this effort.

The processing of symcryptasm files takes place in 2 passes. The first pass is performed by this
symcryptasm_processor.py script, which does the more stateful processing, outputting a .cppasm file.
The plan is to rephrase all remaining .asm in SymCrypt as symcryptasm, extending support as
appropriate to enable this effort.

Normally the processing of symcryptasm files takes place in 2 passes. The first pass is performed by
this symcryptasm_processor.py script, which does the more stateful processing, outputting a .cppasm
file.
The .cppasm files are further processed by the C preprocessor to do more simple stateless text
substitutions, outputting a .asm file which can be assembled by the target assembler for the target
environment.
The exception is when using the armasm64 assembler, which uses the C preprocessor before assembling
its inputs already; so the output of this script is directly assembled by armasm64.

We have set up the intermediate generated files to be created in the output directories in both
razzle and CMake builds.
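To make the two-pass flow concrete, here is a minimal sketch (not part of the commit; the paths, tool
names and preprocessor flags are illustrative assumptions) of driving both passes for an amd64 Linux
target. For armasm64 only the first step applies, since armasm64 runs the C preprocessor itself:

    import subprocess

    # Pass 1: stateful processing by symcryptasm_processor.py (symcryptasm -> cppasm)
    subprocess.check_call(["python3", "scripts/symcryptasm_processor.py",
                           "gas", "amd64", "systemv",
                           "lib/amd64/fdef_asm.symcryptasm", "fdef_asm.cppasm"])

    # Pass 2: stateless text substitution by the C preprocessor (cppasm -> asm)
    subprocess.check_call(["cc", "-E", "-P", "-x", "c", "-Ilib", "-Iinc",
                           "-DSYMCRYPT_GAS", "-DSYMCRYPT_CPU_AMD64",
                           "-o", "fdef_asm.asm", "fdef_asm.cppasm"])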
@ -42,6 +48,7 @@ FUNCTION_START macro which currently takes 3 arguments:

    These arguments will be accessible in some contiguous region of the symcrypt registers at the
    start of the function
    On amd64 this contiguous region is R1..R<arg_count>
    On arm64 this contiguous region is R0..R<arg_count-1>
    Note: arg_count need not correspond to the exact number of arguments in the function declaration
    if the assembly does not use some tail of the arguments
3) The number of registers (reg_count) that the function uses
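As a quick standalone illustration (not part of the commit; the argument and register counts below are
made up), the processor recognises such a declaration with the FUNCTION_START_PATTERN regular
expression defined further down in the script:

    import re

    FUNCTION_START_PATTERN = re.compile(
        r"\s*(NESTED_)?(MUL_)?FUNCTION_START\s*\(\s*([a-zA-Z0-9_\(\)]+)\s*,\s*([0-9]+)\s*,\s*([0-9]+)\s*\)")

    m = FUNCTION_START_PATTERN.match("FUNCTION_START(SymCryptFdefRawAddAsm, 4, 8)")
    print(m.group(3), int(m.group(4)), int(m.group(5)))  # -> SymCryptFdefRawAddAsm 4 8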
@ -58,6 +65,7 @@ At the function end an epilogue is generated with restores the non-volatile regi

A nested function (a function which does call another function) is specified similarly, only using
NESTED_FUNCTION_START and NESTED_FUNCTION_END macros. A nested function currently updates and aligns
the stack pointer in the function prologue, and avoids use of the redzone in the SystemV ABI.
Nested functions are not currently supported for Arm64.


A macro begins with an invocation of the MACRO_START macro which takes the Macro name, and variable

@ -82,6 +90,15 @@ and QH. As rdx is used to pass arguments, its value is moved to another register

prologue. The MUL_FUNCTION_START and MUL_FUNCTION_END macros are used in this case.
We currently do not support nested mul functions, as we have none of them.

### arm64 ###
We allow up to 23 registers to be addressed, with the names:
X_0-X_22 (64-bit registers) and W_0-W_22 (32-bit registers)
v0-v7 ASIMD registers may be used directly in assembly too, as in both arm64 calling conventions we
currently support, these registers are volatile so do not need any special handling

X_0 is always the result register and the first argument passed to the function.
X_1-X_7 are the arguments 2-8 passed to the function

"""

import re
@ -91,37 +108,71 @@ import logging

class Register:
    """A class to represent registers"""

    def __init__(self, name64, name32, name16, name8):
    def __init__(self, name64, name32, name16=None, name8=None):
        self.name64 = name64
        self.name32 = name32
        self.name16 = name16
        self.name8 = name8

# amd64 registers
REG_RAX = Register("rax", "eax", "ax", "al")
REG_RBX = Register("rbx", "ebx", "bx", "bl")
REG_RCX = Register("rcx", "ecx", "cx", "cl")
REG_RDX = Register("rdx", "edx", "dx", "dl")
REG_RSI = Register("rsi", "esi", "si", "sil")
REG_RDI = Register("rdi", "edi", "di", "dil")
REG_RSP = Register("rsp", "esp", "sp", "spl")
REG_RBP = Register("rbp", "ebp", "bp", "bpl")
REG_R8  = Register( "r8",  "r8d",  "r8w",  "r8b")
REG_R9  = Register( "r9",  "r9d",  "r9w",  "r9b")
REG_R10 = Register("r10", "r10d", "r10w", "r10b")
REG_R11 = Register("r11", "r11d", "r11w", "r11b")
REG_R12 = Register("r12", "r12d", "r12w", "r12b")
REG_R13 = Register("r13", "r13d", "r13w", "r13b")
REG_R14 = Register("r14", "r14d", "r14w", "r14b")
REG_R15 = Register("r15", "r15d", "r15w", "r15b")
AMD64_RAX = Register("rax", "eax", "ax", "al")
AMD64_RBX = Register("rbx", "ebx", "bx", "bl")
AMD64_RCX = Register("rcx", "ecx", "cx", "cl")
AMD64_RDX = Register("rdx", "edx", "dx", "dl")
AMD64_RSI = Register("rsi", "esi", "si", "sil")
AMD64_RDI = Register("rdi", "edi", "di", "dil")
AMD64_RSP = Register("rsp", "esp", "sp", "spl")
AMD64_RBP = Register("rbp", "ebp", "bp", "bpl")
AMD64_R8  = Register( "r8",  "r8d",  "r8w",  "r8b")
AMD64_R9  = Register( "r9",  "r9d",  "r9w",  "r9b")
AMD64_R10 = Register("r10", "r10d", "r10w", "r10b")
AMD64_R11 = Register("r11", "r11d", "r11w", "r11b")
AMD64_R12 = Register("r12", "r12d", "r12w", "r12b")
AMD64_R13 = Register("r13", "r13d", "r13w", "r13b")
AMD64_R14 = Register("r14", "r14d", "r14w", "r14b")
AMD64_R15 = Register("r15", "r15d", "r15w", "r15b")

# arm64 registers
ARM64_R0  = Register( "x0",  "w0")
ARM64_R1  = Register( "x1",  "w1")
ARM64_R2  = Register( "x2",  "w2")
ARM64_R3  = Register( "x3",  "w3")
ARM64_R4  = Register( "x4",  "w4")
ARM64_R5  = Register( "x5",  "w5")
ARM64_R6  = Register( "x6",  "w6")
ARM64_R7  = Register( "x7",  "w7")
ARM64_R8  = Register( "x8",  "w8")
ARM64_R9  = Register( "x9",  "w9")
ARM64_R10 = Register("x10", "w10")
ARM64_R11 = Register("x11", "w11")
ARM64_R12 = Register("x12", "w12")
ARM64_R13 = Register("x13", "w13")
ARM64_R14 = Register("x14", "w14")
ARM64_R15 = Register("x15", "w15")
ARM64_R16 = Register("x16", "w16")
ARM64_R17 = Register("x17", "w17")
ARM64_R18 = Register("x18", "w18")
ARM64_R19 = Register("x19", "w19")
ARM64_R20 = Register("x20", "w20")
ARM64_R21 = Register("x21", "w21")
ARM64_R22 = Register("x22", "w22")
ARM64_R23 = Register("x23", "w23")
ARM64_R24 = Register("x24", "w24")
ARM64_R25 = Register("x25", "w25")
ARM64_R26 = Register("x26", "w26")
ARM64_R27 = Register("x27", "w27")
ARM64_R28 = Register("x28", "w28")
ARM64_R29 = Register("x29", "w29") # Frame Pointer
ARM64_R30 = Register("x30", "w30") # Link Register

class CallingConvention:
    """A class to represent calling conventions"""

    def __init__(self, name, architecture, mapping, argument_registers, volatile_registers, gen_prologue_fn, gen_epilogue_fn, gen_get_memslot_offset_fn):
    def __init__(self, name, architecture, mapping, max_arguments, argument_registers, volatile_registers, gen_prologue_fn, gen_epilogue_fn, gen_get_memslot_offset_fn):
        self.name = name
        self.architecture = architecture
        self.mapping = mapping
        self.max_arguments = max_arguments
        self.argument_registers = argument_registers
        self.volatile_registers = volatile_registers
        self.gen_prologue_fn = types.MethodType(gen_prologue_fn, self)
@ -139,9 +190,9 @@ def get_mul_mapping_from_normal_mapping(mapping, argument_registers):

    we refer to rdx using (Q|D|W|B)H.
    """
    rdx_index = None
    return_mapping = { 'H': REG_RDX }
    return_mapping = { 'H': AMD64_RDX }
    for (index, register) in mapping.items():
        if register == REG_RDX:
        if register == AMD64_RDX:
            rdx_index = index
            break
    for (index, register) in mapping.items():

@ -156,28 +207,23 @@ def get_mul_mapping_from_normal_mapping(mapping, argument_registers):

            return_mapping[index-1] = register
    return return_mapping

# Calling convention constants

MAX_FUNCTION_ARGUMENT_COUNT = 6 # restrict to 6 arguments for now
MAX_FUNCTION_REGISTER_COUNT = 15

# Microsoft x64 calling convention
MAPPING_AMD64_MSFT = {
    0: REG_RAX, # Result register
    1: REG_RCX, # Argument 1 / volatile
    2: REG_RDX, # Argument 2 / volatile
    3: REG_R8,  # Argument 3 / volatile
    4: REG_R9,  # Argument 4 / volatile
    5: REG_R10, # volatile
    6: REG_R11, # volatile
    7: REG_RSI, # All registers from rsi are non-volatile and need to be saved/restored in epi/prologue
    8: REG_RDI,
    9: REG_RBP,
    10:REG_RBX,
    11:REG_R12,
    12:REG_R13,
    13:REG_R14,
    14:REG_R15,
    0: AMD64_RAX, # Result register / volatile
    1: AMD64_RCX, # Argument 1 / volatile
    2: AMD64_RDX, # Argument 2 / volatile
    3: AMD64_R8,  # Argument 3 / volatile
    4: AMD64_R9,  # Argument 4 / volatile
    5: AMD64_R10, # volatile
    6: AMD64_R11, # volatile
    7: AMD64_RSI, # All registers from rsi are non-volatile and need to be saved/restored in epi/prologue
    8: AMD64_RDI,
    9: AMD64_RBP,
    10:AMD64_RBX,
    11:AMD64_R12,
    12:AMD64_R13,
    13:AMD64_R14,
    14:AMD64_R15,
    # currently not mapping rsp
}
@ -212,11 +258,11 @@ def gen_prologue_amd64_msft(self, arg_count, reg_count, mul_fixup="", nested=Fal

    prologue += mul_fixup

    # put additional arguments into Q5-Q6 (we do not support more than 6 (MAX_FUNCTION_ARGUMENT_COUNT) arguments for now)
    # put additional arguments into Q5-Q6 (we do not support more than 6 arguments for now)
    # stack_offset to get the 5th argument is:
    # 32B of shadow space + 8B for return address + (8*#pushed registers in prologue) + shadow_space_allocation_size
    stack_offset = 32 + 8 + (8*(reg_count-self.volatile_registers)) + shadow_space_allocation_size
    for i in range(self.argument_registers+1, min(arg_count+1, MAX_FUNCTION_ARGUMENT_COUNT+1)):
    for i in range(self.argument_registers+1, arg_count+1):
        prologue += "mov Q%s, [rsp + %d]\n" % (i, stack_offset)
        stack_offset += 8
    return prologue
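For intuition, a worked example with illustrative numbers (not taken from the commit): in the msft_x64
convention volatile_registers is 7, so a function declared with reg_count = 8 pushes one non-volatile
register, and assuming no extra shadow-space allocation the 5th argument is loaded from [rsp + 48]:

    volatile_registers = 7             # msft_x64 convention above
    reg_count = 8                      # hypothetical function using one non-volatile register
    shadow_space_allocation_size = 0   # assume no additional shadow space is allocated

    stack_offset = 32 + 8 + 8 * (reg_count - volatile_registers) + shadow_space_allocation_size
    print(stack_offset)                # -> 48, i.e. the 5th argument is at [rsp + 48]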
@ -247,7 +293,7 @@ def gen_epilogue_amd64_msft_nested(self, arg_count, reg_count):

def gen_get_memslot_offset_amd64_msft(self, slot, arg_count, reg_count, nested=False):
    # only support 4 memory slots for now (in shadow space)
    if(slot >= 4):
        logging.error("Symcryptasm currently only support 4 memory slots! (requested slot%d)" % slot)
        logging.error("symcryptasm currently only support 4 memory slots! (requested slot%d)" % slot)
        exit(1)
    # 8B for return address + (8*#pushed registers in prologue)
    stack_offset = 8 + (8*(reg_count-self.volatile_registers))

@ -259,32 +305,32 @@ def gen_get_memslot_offset_amd64_msft_nested(self, slot, arg_count, reg_count):

    return gen_get_memslot_offset_amd64_msft(self, slot, arg_count, reg_count, nested=True)

CALLING_CONVENTION_AMD64_MSFT = CallingConvention(
    "msft_x64", "amd64", MAPPING_AMD64_MSFT, 4, 7,
    "msft_x64", "amd64", MAPPING_AMD64_MSFT, 6, 4, 7,
    gen_prologue_amd64_msft, gen_epilogue_amd64_msft, gen_get_memslot_offset_amd64_msft)
CALLING_CONVENTION_AMD64_MSFT_MUL = CallingConvention(
    "msft_x64", "amd64", get_mul_mapping_from_normal_mapping(MAPPING_AMD64_MSFT, 4), 4, 6,
    "msft_x64", "amd64", get_mul_mapping_from_normal_mapping(MAPPING_AMD64_MSFT, 4), 6, 4, 6,
    gen_prologue_amd64_msft_mul, gen_epilogue_amd64_msft, gen_get_memslot_offset_amd64_msft)
CALLING_CONVENTION_AMD64_MSFT_NESTED = CallingConvention(
    "msft_x64", "amd64", MAPPING_AMD64_MSFT, 4, 7,
    "msft_x64", "amd64", MAPPING_AMD64_MSFT, 6, 4, 7,
    gen_prologue_amd64_msft_nested, gen_epilogue_amd64_msft_nested, gen_get_memslot_offset_amd64_msft_nested)

# AMD64 System V calling convention
MAPPING_AMD64_SYSTEMV = {
    0: REG_RAX, # Result register
    1: REG_RDI, # Argument 1 / volatile
    2: REG_RSI, # Argument 2 / volatile
    3: REG_RDX, # Argument 3 / volatile
    4: REG_RCX, # Argument 4 / volatile
    5: REG_R8,  # Argument 5 / volatile
    6: REG_R9,  # Argument 6 / volatile
    7: REG_R10, # volatile
    8: REG_R11, # volatile
    9: REG_RBX, # All registers from rbx are non-volatile and need to be saved/restored in epi/prologue
    10:REG_RBP,
    11:REG_R12,
    12:REG_R13,
    13:REG_R14,
    14:REG_R15
    0: AMD64_RAX, # Result register / volatile
    1: AMD64_RDI, # Argument 1 / volatile
    2: AMD64_RSI, # Argument 2 / volatile
    3: AMD64_RDX, # Argument 3 / volatile
    4: AMD64_RCX, # Argument 4 / volatile
    5: AMD64_R8,  # Argument 5 / volatile
    6: AMD64_R9,  # Argument 6 / volatile
    7: AMD64_R10, # volatile
    8: AMD64_R11, # volatile
    9: AMD64_RBX, # All registers from rbx are non-volatile and need to be saved/restored in epi/prologue
    10:AMD64_RBP,
    11:AMD64_R12,
    12:AMD64_R13,
    13:AMD64_R14,
    14:AMD64_R15
    # currently not mapping rsp
}
@ -305,7 +351,7 @@ def gen_prologue_amd64_systemv(self, arg_count, reg_count, mul_fixup="", nested=

    prologue += mul_fixup

    # do not support more than 6 (MAX_FUNCTION_ARGUMENT_COUNT) arguments for now
    # do not support more than 6 arguments for now
    # # put additional arguments into Q7-Qn
    # # stack_offset to get the 7th argument is:
    # # 8B for return address

@ -341,7 +387,7 @@ def gen_epilogue_amd64_systemv_nested(self, arg_count, reg_count):

def gen_get_memslot_offset_amd64_systemv(self, slot, arg_count, reg_count, nested=False):
    # only support 4 memory slots for now
    if(slot >= 4):
        logging.error("Symcryptasm currently only support 4 memory slots! (requested slot%d)" % slot)
        logging.error("symcryptasm currently only support 4 memory slots! (requested slot%d)" % slot)
        exit(1)
    # For leaf functions, use the top of the redzone below the stack pointer
    offset = -8 * (slot+1)

@ -354,58 +400,230 @@ def gen_get_memslot_offset_amd64_systemv_nested(self, slot, arg_count, reg_count

    return gen_get_memslot_offset_amd64_systemv(self, slot, arg_count, reg_count, nested=True)

CALLING_CONVENTION_AMD64_SYSTEMV = CallingConvention(
    "amd64_systemv", "amd64", MAPPING_AMD64_SYSTEMV, 6, 9,
    "amd64_systemv", "amd64", MAPPING_AMD64_SYSTEMV, 6, 6, 9,
    gen_prologue_amd64_systemv, gen_epilogue_amd64_systemv, gen_get_memslot_offset_amd64_systemv)
CALLING_CONVENTION_AMD64_SYSTEMV_MUL = CallingConvention(
    "amd64_systemv", "amd64", get_mul_mapping_from_normal_mapping(MAPPING_AMD64_SYSTEMV, 6), 6, 8,
    "amd64_systemv", "amd64", get_mul_mapping_from_normal_mapping(MAPPING_AMD64_SYSTEMV, 6), 6, 6, 8,
    gen_prologue_amd64_systemv_mul, gen_epilogue_amd64_systemv, gen_get_memslot_offset_amd64_systemv)
CALLING_CONVENTION_AMD64_SYSTEMV_NESTED = CallingConvention(
    "amd64_systemv", "amd64", MAPPING_AMD64_SYSTEMV, 6, 9,
    "amd64_systemv", "amd64", MAPPING_AMD64_SYSTEMV, 6, 6, 9,
    gen_prologue_amd64_systemv_nested, gen_epilogue_amd64_systemv_nested, gen_get_memslot_offset_amd64_systemv_nested)


def gen_function_start_defines(mapping, arg_count, reg_count):
# ARM64 calling conventions
MAPPING_ARM64_AAPCS64 = {
    0: ARM64_R0,  # Argument 1 / Result register / volatile
    1: ARM64_R1,  # Argument 2 / volatile
    2: ARM64_R2,  # Argument 3 / volatile
    3: ARM64_R3,  # Argument 4 / volatile
    4: ARM64_R4,  # Argument 5 / volatile
    5: ARM64_R5,  # Argument 6 / volatile
    6: ARM64_R6,  # Argument 7 / volatile
    7: ARM64_R7,  # Argument 8 / volatile
    8: ARM64_R8,  # Indirect result location / volatile
    9: ARM64_R9,  # volatile
    10:ARM64_R10, # volatile
    11:ARM64_R11, # volatile
    12:ARM64_R12, # volatile
    13:ARM64_R13, # volatile
    14:ARM64_R14, # volatile
    15:ARM64_R15, # volatile
    # R16 and R17 are intra-procedure-call temporary registers which may be used by the linker
    # We cannot use these registers for local scratch if we call out to arbitrary procedures, but
    # currently we only have leaf functions in Arm64 symcryptasm.
    16:ARM64_R16, # IP0 / volatile
    17:ARM64_R17, # IP1 / volatile
    # R18 is a platform register which has a special meaning in kernel mode - we do not use it
    18:ARM64_R19, # non-volatile
    19:ARM64_R20, # non-volatile
    20:ARM64_R21, # non-volatile
    21:ARM64_R22, # non-volatile
    22:ARM64_R23, # non-volatile
    # We could map more registers (R24-R28) but we can only support 23 registers for ARM64EC, and we
    # don't use this many registers in any symcryptasm yet
}

MAPPING_ARM64_ARM64ECMSFT = {
    0: ARM64_R0,  # Argument 1 / Result register / volatile
    1: ARM64_R1,  # Argument 2 / volatile
    2: ARM64_R2,  # Argument 3 / volatile
    3: ARM64_R3,  # Argument 4 / volatile
    4: ARM64_R4,  # Argument 5 / volatile
    5: ARM64_R5,  # Argument 6 / volatile
    6: ARM64_R6,  # Argument 7 / volatile
    7: ARM64_R7,  # Argument 8 / volatile
    8: ARM64_R8,  # Indirect result location / volatile
    9: ARM64_R9,  # volatile
    10:ARM64_R10, # volatile
    11:ARM64_R11, # volatile
    12:ARM64_R12, # volatile
    # R13 and R14 are reserved in ARM64EC
    13:ARM64_R15, # volatile
    14:ARM64_R16, # volatile
    15:ARM64_R17, # volatile
    16:ARM64_R19, # non-volatile
    17:ARM64_R20, # non-volatile
    18:ARM64_R21, # non-volatile
    19:ARM64_R22, # non-volatile
    # R23 and R24 are reserved in ARM64EC
    20:ARM64_R25, # non-volatile
    21:ARM64_R26, # non-volatile
    22:ARM64_R27, # non-volatile
    # R28 is reserved in ARM64EC
}
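A small self-contained sketch (not part of the commit) of the X_n naming that the aapcs64 mapping above
effectively produces: symbolic indices 0-17 map straight to x0-x17, while indices 18-22 skip the
reserved platform register x18 and land on x19-x23:

    aapcs64_name64 = {i: "x%d" % i for i in range(18)}                  # X_0..X_17 -> x0..x17
    aapcs64_name64.update({i: "x%d" % (i + 1) for i in range(18, 23)})  # X_18..X_22 -> x19..x23

    for index in sorted(aapcs64_name64):
        print("#define X_%d %s" % (index, aapcs64_name64[index]))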
def gen_prologue_aapcs64(self, arg_count, reg_count):
    prologue = ""

    if reg_count > self.volatile_registers:
        logging.error("symcryptasm currently does not support spilling registers in leaf functions in aapcs64")
        exit(1)

    return prologue

def gen_epilogue_aapcs64(self, arg_count, reg_count):
    epilogue = ""

    if reg_count > self.volatile_registers:
        logging.error("symcryptasm currently does not support spilling registers in leaf functions in aapcs64")
        exit(1)

    epilogue += "    ret\n"

    return epilogue

def gen_prologue_arm64ec(self, arg_count, reg_count):
    prologue = ""

    if reg_count > self.volatile_registers:
        # Calculate required stack space
        # If we allocate stack space we must spill fp and lr, so we always spill at least 2 registers
        registers_to_spill = 2 + reg_count - self.volatile_registers
        # Stack pointer must remain 16B aligned, so round up to the nearest multiple of 16B
        required_stack_space = 16 * ((registers_to_spill + 1) // 2)
        prologue += "    PROLOG_SAVE_REG_PAIR fp, lr, #-%d! // allocate %d bytes of stack; store FP/LR\n" % (required_stack_space, required_stack_space)

        stack_offset = 16
        for i in range(self.volatile_registers, reg_count-1, 2):
            prologue += "    PROLOG_SAVE_REG_PAIR X_%d, X_%d, #%d\n" % (i, i+1, stack_offset)
            stack_offset += 16
        if registers_to_spill % 2 == 1:
            prologue += "    PROLOG_SAVE_REG X_%d, #%d\n" % (reg_count-1, stack_offset)

    return prologue

def gen_epilogue_arm64ec(self, arg_count, reg_count):
    epilogue = ""

    if reg_count > self.volatile_registers:
        # Calculate required stack space
        # If we allocate stack space we must spill fp and lr, so we always spill at least 2 registers
        registers_to_spill = 2 + reg_count - self.volatile_registers
        # Stack pointer must remain 16B aligned, so round up to the nearest multiple of 16B
        required_stack_space = 16 * ((registers_to_spill + 1) // 2)

        stack_offset = required_stack_space-16
        if registers_to_spill % 2 == 1:
            epilogue += "    EPILOG_RESTORE_REG X_%d, #%d\n" % (reg_count-1, stack_offset)
            stack_offset -= 16
        for i in reversed(range(self.volatile_registers, reg_count-1, 2)):
            epilogue += "    EPILOG_RESTORE_REG_PAIR X_%d, X_%d, #%d\n" % (i, i+1, stack_offset)
            stack_offset -= 16
        epilogue += "    EPILOG_RESTORE_REG_PAIR fp, lr, #%d! // deallocate %d bytes of stack; restore FP/LR\n" % (required_stack_space, required_stack_space)
        epilogue += "    EPILOG_RETURN\n"
    else:
        epilogue += "    ret\n"

    return epilogue
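A worked example of the stack-space arithmetic above (illustrative numbers only): for the arm64ec
convention, volatile_registers is 16, so a hypothetical function declared with reg_count = 19 has to
spill X_16..X_18 (x19..x21) plus fp/lr:

    volatile_registers = 16
    reg_count = 19
    registers_to_spill = 2 + reg_count - volatile_registers        # 5 registers (fp, lr, x19..x21)
    required_stack_space = 16 * ((registers_to_spill + 1) // 2)    # 48 bytes, keeps sp 16B aligned
    print(registers_to_spill, required_stack_space)                # -> 5 48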
def gen_get_memslot_offset_arm64(self, slot, arg_count, reg_count, nested=False):
    logging.error("symcryptasm currently does not support memory slots for arm64!")
    exit(1)

CALLING_CONVENTION_ARM64_AAPCS64 = CallingConvention(
    "arm64_aapcs64", "arm64", MAPPING_ARM64_AAPCS64, 8, 8, 18,
    gen_prologue_aapcs64, gen_epilogue_aapcs64, gen_get_memslot_offset_arm64)

CALLING_CONVENTION_ARM64EC_MSFT = CallingConvention(
    "arm64ec_msft", "arm64", MAPPING_ARM64_ARM64ECMSFT, 8, 8, 16,
    gen_prologue_arm64ec, gen_epilogue_arm64ec, gen_get_memslot_offset_arm64)

def gen_function_defines(architecture, mapping, arg_count, reg_count, start=True):
    defines = ""
    if architecture == "amd64":
        prefix64 = "Q"
        prefix32 = "D"
        prefix16 = "W"
        prefix8  = "B"
    elif architecture == "arm64":
        prefix64 = "X_"
        prefix32 = "W_"
    else:
        logging.error("Unhandled architecture (%s) in gen_function_defines" % architecture)
        exit(1)

    for (index, reg) in mapping.items():
        if (index != 'H') and (index >= max(arg_count+1, reg_count)):
            continue
        defines += "#define Q%s %s\n" % (index, reg.name64)
        defines += "#define D%s %s\n" % (index, reg.name32)
        defines += "#define W%s %s\n" % (index, reg.name16)
        defines += "#define B%s %s\n" % (index, reg.name8)
        if start:
            if (reg.name64 is not None):
                defines += "#define %s%s %s\n" % (prefix64, index, reg.name64)
            if (reg.name32 is not None):
                defines += "#define %s%s %s\n" % (prefix32, index, reg.name32)
            if (reg.name16 is not None):
                defines += "#define %s%s %s\n" % (prefix16, index, reg.name16)
            if (reg.name8 is not None):
                defines += "#define %s%s %s\n" % (prefix8, index, reg.name8)
        else:
            if (reg.name64 is not None):
                defines += "#undef %s%s\n" % (prefix64, index)
            if (reg.name32 is not None):
                defines += "#undef %s%s\n" % (prefix32, index)
            if (reg.name16 is not None):
                defines += "#undef %s%s\n" % (prefix16, index)
            if (reg.name8 is not None):
                defines += "#undef %s%s\n" % (prefix8, index)
    return defines

def gen_function_end_defines(mapping, arg_count, reg_count):
    undefs = ""
    for (index, _) in mapping.items():
        if (index != 'H') and (index >= max(arg_count+1, reg_count)):
            continue
        undefs += "#undef Q%s\n" % (index)
        undefs += "#undef D%s\n" % (index)
        undefs += "#undef W%s\n" % (index)
        undefs += "#undef B%s\n" % (index)
    return undefs
def gen_function_start_defines(architecture, mapping, arg_count, reg_count):
    return gen_function_defines(architecture, mapping, arg_count, reg_count, start=True)

MASM_FRAMELESS_FUNCTION_ENTRY = "LEAF_ENTRY %s, _TEXT\n"
MASM_FRAMELESS_FUNCTION_END   = "LEAF_END %s, _TEXT\n"
MASM_FRAME_FUNCTION_ENTRY     = "NESTED_ENTRY %s, _TEXT\n"
MASM_FRAME_FUNCTION_END       = "NESTED_END %s, _TEXT\n"
def gen_function_end_defines(architecture, mapping, arg_count, reg_count):
    return gen_function_defines(architecture, mapping, arg_count, reg_count, start=False)

MASM_FRAMELESS_FUNCTION_ENTRY = "LEAF_ENTRY %s"
MASM_FRAMELESS_FUNCTION_END   = "LEAF_END %s"
MASM_FRAME_FUNCTION_ENTRY     = "NESTED_ENTRY %s"
MASM_FRAME_FUNCTION_END       = "NESTED_END %s"
# MASM function macros take the text area as an argument
MASM_FUNCTION_TEMPLATE = "%s, _TEXT\n"
# ARMASM64 function macros must be correctly indented
ARMASM64_FUNCTION_TEMPLATE = "    %s\n"

GAS_FUNCTION_ENTRY = "%s: .global %s\n"
GAS_FUNCTION_END = ""

def generate_prologue(assembler, calling_convention, function_name, arg_count, reg_count, nested):
    function_entry = None
    if assembler == "masm":
        # need to identify and mark up frame functions in masm
    if assembler in ["masm", "armasm64"]:
        # need to identify and mark up frame functions in masm and armasm64
        if nested or (reg_count > calling_convention.volatile_registers):
            function_entry = MASM_FRAME_FUNCTION_ENTRY % (function_name)
        else:
            function_entry = MASM_FRAMELESS_FUNCTION_ENTRY % (function_name)

        if assembler == "masm":
            function_entry = MASM_FUNCTION_TEMPLATE % function_entry
        elif assembler == "armasm64":
            function_entry = ARMASM64_FUNCTION_TEMPLATE % function_entry
    elif assembler == "gas":
        function_entry = GAS_FUNCTION_ENTRY % (function_name, function_name)
    else:
        logging.error("Unhandled assembler (%s) in generate_prologue" % assembler)
        exit(1)

    prologue = gen_function_start_defines(calling_convention.mapping, arg_count, reg_count)
    prologue = gen_function_start_defines(calling_convention.architecture, calling_convention.mapping, arg_count, reg_count)
    prologue += "%s" % (function_entry)
    prologue += calling_convention.gen_prologue_fn(arg_count, reg_count)

@ -413,31 +631,41 @@ def generate_prologue(assembler, calling_convention, function_name, arg_count, r

def generate_epilogue(assembler, calling_convention, function_name, arg_count, reg_count, nested):
    function_end = None
    if assembler == "masm":
    if assembler in ["masm", "armasm64"]:
        # need to identify and mark up frame functions in masm
        if nested or (reg_count > calling_convention.volatile_registers):
            function_end = MASM_FRAME_FUNCTION_END % (function_name)
        else:
            function_end = MASM_FRAMELESS_FUNCTION_END % (function_name)

        if assembler == "masm":
            function_end = MASM_FUNCTION_TEMPLATE % function_end
        elif assembler == "armasm64":
            function_end = ARMASM64_FUNCTION_TEMPLATE % function_end
    elif assembler == "gas":
        function_end = GAS_FUNCTION_END
    else:
        logging.error("Unhandled assembler (%s) in generate_epilogue" % assembler)
        exit(1)

    epilogue = calling_convention.gen_epilogue_fn(arg_count, reg_count)
    epilogue += "%s" % (function_end)
    epilogue += gen_function_end_defines(calling_convention.mapping, arg_count, reg_count)
    epilogue += gen_function_end_defines(calling_convention.architecture, calling_convention.mapping, arg_count, reg_count)

    return epilogue

MASM_MACRO_START    = "%s MACRO %s\n"
MASM_MACRO_END      = "ENDM\n"
ARMASM64_MACRO_START= "    MACRO\n    %s %s"
ARMASM64_MACRO_END  = "    MEND\n"
GAS_MACRO_START     = ".macro %s %s\n"
GAS_MACRO_END       = ".endm\n"
MASM_ALTERNATE_ENTRY= "ALTERNATE_ENTRY %s\n"
GAS_ALTERNATE_ENTRY = "%s: .global %s\n"


FUNCTION_START_PATTERN = re.compile("\s*(NESTED_)?(MUL_)?FUNCTION_START\s*\(\s*([a-zA-Z0-9]+)\s*,\s*([0-9]+)\s*,\s*([0-9]+)\s*\)")
FUNCTION_END_PATTERN = re.compile("\s*(NESTED_)?(MUL_)?FUNCTION_END\s*\(\s*([a-zA-Z0-9]+)\s*\)")
FUNCTION_START_PATTERN = re.compile("\s*(NESTED_)?(MUL_)?FUNCTION_START\s*\(\s*([a-zA-Z0-9_\(\)]+)\s*,\s*([0-9]+)\s*,\s*([0-9]+)\s*\)")
FUNCTION_END_PATTERN = re.compile("\s*(NESTED_)?(MUL_)?FUNCTION_END\s*\(\s*([a-zA-Z0-9_\(\)]+)\s*\)")
GET_MEMSLOT_PATTERN = re.compile("GET_MEMSLOT_OFFSET\s*\(\s*slot([0-9]+)\s*\)")
ALTERNATE_ENTRY_PATTERN = re.compile("\s*ALTERNATE_ENTRY\s*\(\s*([a-zA-Z0-9]+)\s*\)")
MACRO_START_PATTERN = re.compile("\s*MACRO_START\s*\(\s*([A-Z_0-9]+)\s*,([^\)]+)\)")
|
@ -499,29 +727,41 @@ class ProcessingStateMachine:
|
|||
self.arg_count = int(match.groups()[-2])
|
||||
self.reg_count = int(match.groups()[-1])
|
||||
|
||||
if self.is_nested_function and self.nested_calling_convention is None:
|
||||
logging.error(
|
||||
"symcryptasm nested functions are not currently supported with assembler (%s) and architecture (%s)!\n\t"
|
||||
"%s (line %d)"
|
||||
% (self.assembler, self.normal_calling_convention.architecture, line, line_num))
|
||||
exit(1)
|
||||
if self.is_mul_function and self.mul_calling_convention is None:
|
||||
logging.error(
|
||||
"symcryptasm mul functions are not supported with assembler (%s) and architecture (%s)!\n\t"
|
||||
"%s (line %d)"
|
||||
% (self.assembler, self.normal_calling_convention.architecture, line, line_num))
|
||||
exit(1)
|
||||
if self.is_nested_function and self.is_mul_function:
|
||||
logging.error(
|
||||
"Too many prefixes for symcryptasm function - currently only 1 of prefix, MUL_ or NESTED_, is supported!\n\t"
|
||||
"%s (line %d)"
|
||||
% (line, line_num))
|
||||
exit(1)
|
||||
if self.arg_count > MAX_FUNCTION_ARGUMENT_COUNT:
|
||||
if self.arg_count > self.normal_calling_convention.max_arguments:
|
||||
logging.error(
|
||||
"Too many (%d) arguments for symcryptasm function - currently only %d arguments are supported!\n\t"
|
||||
"Too many (%d) arguments for symcryptasm function - only %d arguments are supported by calling convention (%s)\n\t"
|
||||
"%s (line %d)"
|
||||
% (self.arg_count, MAX_FUNCTION_ARGUMENT_COUNT, match.group(0), line_num))
|
||||
% (self.arg_count, self.normal_calling_convention.max_arguments, self.normal_calling_convention.name, match.group(0), line_num))
|
||||
exit(1)
|
||||
if self.reg_count > MAX_FUNCTION_REGISTER_COUNT:
|
||||
if self.reg_count > len(self.normal_calling_convention.mapping):
|
||||
logging.error(
|
||||
"Too many (%d) registers required for symcryptasm function - only %d registers are supported!\n\t"
|
||||
"Too many (%d) registers required for symcryptasm function - only %d registers are mapped by calling convention (%s)\n\t"
|
||||
"%s (line %d)"
|
||||
% (self.reg_count, MAX_FUNCTION_REGISTER_COUNT, match.group(0), line_num))
|
||||
% (self.reg_count, len(self.normal_calling_convention.mapping), self.normal_calling_convention.name, match.group(0), line_num))
|
||||
exit(1)
|
||||
if self.is_mul_function and self.reg_count > MAX_FUNCTION_REGISTER_COUNT-1:
|
||||
if self.is_mul_function and self.reg_count > len(self.mul_calling_convention.mapping)-1:
|
||||
logging.error(
|
||||
"Too many (%d) registers required for symcryptasm mul function - only %d registers are supported!\n\t"
|
||||
"Too many (%d) registers required for symcryptasm mul function - only %d registers are mapped by calling convention (%s)\n\t"
|
||||
"%s (line %d)"
|
||||
% (self.reg_count, MAX_FUNCTION_REGISTER_COUNT-1, match.group(0), line_num))
|
||||
% (self.reg_count, len(self.mul_calling_convention.mapping)-1, self.mul_calling_convention.name, match.group(0), line_num))
|
||||
exit(1)
|
||||
|
||||
logging.info("%d: function start %s, %d, %d" % (line_num, self.function_name, self.arg_count, self.reg_count))
|
||||
|
@ -546,10 +786,18 @@ class ProcessingStateMachine:
|
|||
return MASM_MACRO_START % (self.macro_name, match.group(2))
|
||||
elif self.assembler == "gas":
|
||||
return GAS_MACRO_START % (self.macro_name, match.group(2))
|
||||
elif self.assembler == "armasm64":
|
||||
# In armasm64 we need to escape all macro arguments with $
|
||||
prefixed_args = ", $".join(self.macro_args)
|
||||
if prefixed_args:
|
||||
prefixed_args = "$" + prefixed_args
|
||||
return ARMASM64_MACRO_START % (self.macro_name, prefixed_args)
|
||||
else:
|
||||
logging.error("Unhandled assembler (%s) in process_start_macro" % assembler)
            exit(1)

    def process_function_line(self, line, line_num):
        # Currently in a function

        match = ALTERNATE_ENTRY_PATTERN.match(line)
        if (match):
            if self.assembler == "masm":

@ -562,12 +810,12 @@ class ProcessingStateMachine:

        # Check the end function has same prefix as previous start function
        if (self.is_nested_function ^ (match.group(1) == "NESTED_")) or \
            (self.is_mul_function ^ (match.group(2) == "MUL_")):
            logging.error("Function start and end do not have same MUL_ or NESTED_ prefix!\n\tStart: %s (line %d)\n\tEnd: %s (line %d)"
            logging.error("Function start and end do not have same MUL_ or NESTED_ prefix!\n\tStart: %s (line %d)\n\tEnd: %s (line %d)" \
                % (self.function_start_match.group(0), self.function_start_line, match.group(0), line_num))
            exit(1)
        # Check the end function pattern has the same label as the previous start function pattern
        if self.function_name != match.groups()[-1]:
            logging.error("Function start label does not match Function end label!\n\tStart: %s (line %d)\n\tEnd: %s (line %d)"
            logging.error("Function start label does not match Function end label!\n\tStart: %s (line %d)\n\tEnd: %s (line %d)" \
                % (self.function_name, self.function_start_line, match.groups()[-1], line_num))
            exit(1)

@ -613,8 +861,18 @@ class ProcessingStateMachine:

            return MASM_MACRO_END
        elif self.assembler == "gas":
            return GAS_MACRO_END
        elif self.assembler == "armasm64":
            return ARMASM64_MACRO_END
        else:
            logging.error("Unhandled assembler (%s) in process_macro_line" % self.assembler)
            exit(1)

        if self.assembler == "gas":

        if self.assembler == "armasm64":
            # In armasm64 macros we need to escape all of the macro arguments with a $ in the macro body
            for arg in self.macro_args:
                line = re.sub(arg, "$%s" % arg, line)
        elif self.assembler == "gas":
            # In GAS macros we need to escape all of the macro arguments with a backslash in the macro body
            for arg in self.macro_args:
                line = re.sub(arg, r"\\%s" % arg, line)
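A tiny standalone illustration (not part of the commit; the macro argument name and instruction are
made up) of the two escaping styles in a macro body: an argument named "src" is referenced as $src by
armasm64 and as \src by GAS:

    import re

    line = "ldr x0, [src]"
    print(re.sub("src", "$%s" % "src", line))    # armasm64: ldr x0, [$src]
    print(re.sub("src", r"\\%s" % "src", line))  # GAS:      ldr x0, [\src]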
@ -622,18 +880,40 @@ class ProcessingStateMachine:

        # Not modifying the line any further
        return line

def process_file(target, infilename, outfilename):
    assembler = None
    if target == "masm":
        assembler = "masm"
def process_file(assembler, architecture, calling_convention, infilename, outfilename):
    normal_calling_convention = None

    if assembler == "masm":
        if architecture == "amd64" and calling_convention == "msft":
            normal_calling_convention = CALLING_CONVENTION_AMD64_MSFT
            mul_calling_convention = CALLING_CONVENTION_AMD64_MSFT_MUL
            nested_calling_convention = CALLING_CONVENTION_AMD64_MSFT_NESTED
    elif target == "gas":
        assembler = "gas"
    elif assembler == "gas":
        if architecture == "amd64" and calling_convention == "systemv":
            normal_calling_convention = CALLING_CONVENTION_AMD64_SYSTEMV
            mul_calling_convention = CALLING_CONVENTION_AMD64_SYSTEMV_MUL
            nested_calling_convention = CALLING_CONVENTION_AMD64_SYSTEMV_NESTED
        elif architecture == "arm64" and calling_convention == "aapcs64":
            normal_calling_convention = CALLING_CONVENTION_ARM64_AAPCS64
            mul_calling_convention = None
            nested_calling_convention = None
    elif assembler == "armasm64":
        if architecture == "arm64" and calling_convention == "aapcs64":
            normal_calling_convention = CALLING_CONVENTION_ARM64_AAPCS64
            mul_calling_convention = None
            nested_calling_convention = None
        elif architecture == "arm64" and calling_convention == "arm64ec":
            normal_calling_convention = CALLING_CONVENTION_ARM64EC_MSFT
            mul_calling_convention = None
            nested_calling_convention = None
    else:
        logging.error("Unhandled assembler (%s) in process_file" % assembler)
        exit(1)

    if normal_calling_convention is None:
        logging.error("Unhandled combination (%s + %s + %s) in process_file"
            % (assembler, architecture, calling_convention))
        exit(1)

    # iterate through file line by line in one pass
    file_processing_state = ProcessingStateMachine(

@ -649,9 +929,11 @@ if __name__ == "__main__":

    import argparse

    parser = argparse.ArgumentParser(description="Preprocess symcryptasm into files that will be further processed with C preprocessor to generate MASM or GAS")
    parser.add_argument('target', type=str, help='Target that we want to preprocess for')
    parser.add_argument('assembler', type=str, help='Assembler that we want to preprocess for', choices=['masm', 'gas', 'armasm64'])
    parser.add_argument('architecture', type=str, help='Architecture that we want to preprocess for', choices=['amd64', 'arm64'])
    parser.add_argument('calling_convention', type=str, help='Calling convention that we want to preprocess for', choices=['msft', 'systemv', 'aapcs64', 'arm64ec'])
    parser.add_argument('inputfile', type=str, help='Path to input file')
    parser.add_argument('outputfile', type=str, help='Path to output file')

    args = parser.parse_args()
    process_file(args.target, args.inputfile, args.outputfile)
    process_file(args.assembler, args.architecture, args.calling_convention, args.inputfile, args.outputfile)