Merged PR 6438924: Enable SymCryptAsm for Arm64

+ Extends SymCryptAsm format and script to work in the Arm64 context
  + Now specify architecture, assembler, and calling convention in script invocation
+ Make various changes to assembly to remove redundant instructions, and generally
 slightly improve perf for all platforms (a couple of % here and there)
+ Use assembly routines in Linux builds and remove asmstubs file
+ Do not enable Windows Arm64 build with CMake yet

Related work items: #35613721
This commit is contained in:
Samuel Lee 2021-09-20 08:25:04 +00:00
Родитель c5ef94321c
Коммит 2bc541799d
24 изменённых файлов: 1703 добавлений и 1603 удалений

Просмотреть файл

@ -61,6 +61,9 @@ if(WIN32)
else()
if(NOT SYMCRYPT_TARGET_ENV MATCHES "Generic")
enable_language(ASM)
# Suppress noisy warnings about compile options which are ignored for ASM
# Less messy than restricting most of the below options to only C/CXX!
add_compile_options($<$<COMPILE_LANGUAGE:ASM>:-Wno-unused-command-line-argument>)
endif()
# add_compile_options(-Wall)
# add_compile_options(-Wno-unknown-pragmas)
@ -76,6 +79,12 @@ else()
# Avoids error: cast from pointer to smaller type 'uintptr_t' when including <memory> from aarch64-linux-gnu
add_compile_options(-fms-extensions)
# GCC and clang unroll more aggressively than they should for best performance
# When we want to unroll loops, we unroll in the source code, so tell the compiler not to unroll
# (clang seems to respect this option globally, but I could only make GCC behave in AES-GCM by
# using GCC-specific pragmas for the loops of interest)
add_compile_options(-fno-unroll-loops)
# In Sanitize version, enable sanitizers
if (CMAKE_BUILD_TYPE MATCHES Sanitize)
add_compile_options(-fsanitize=address)
@ -120,12 +129,6 @@ else()
add_link_options(-fsanitize=vptr)
add_link_options(-fno-sanitize-recover=all)
endif()
# GCC and clang unroll more aggressively than they should for best performance
# When we want to unroll loops, we unroll in the source code, so tell the compiler not to unroll
# (clang seems to respect this option globally, but I could only make GCC behave in AES-GCM by
# using GCC-specific pragmas for the loops of interest)
add_compile_options(-fno-unroll-loops)
endif()
if(CMAKE_BUILD_TYPE MATCHES Release)

Просмотреть файл

@ -1,5 +1,5 @@
# This toolchain file configures CMake options for Linux User Mode AMD64 compilation with CPU optimizations.
# To use the toolchain file, run cmake .. -DCMAKE_TOOLCHAIN_FILE=cmake-toolchain/LinuxUserMode-AMD64.cmake
# To use the toolchain file, run cmake .. -DCMAKE_TOOLCHAIN_FILE="cmake-toolchain/LinuxUserMode-AMD64.cmake"
# Set CMake variables that subsequent CMake scripts can check against
set(CMAKE_SYSTEM_NAME Linux)

Просмотреть файл

@ -1,5 +1,5 @@
# This toolchain file configures CMake options for Linux User Mode ARM64 compilation with CPU optimizations.
# To use the toolchain file, run cmake .. -DCMAKE_TOOLCHAIN_FILE=cmake-toolchain/LinuxUserMode-ARM64.cmake
# To use the toolchain file, run cmake .. -DCMAKE_TOOLCHAIN_FILE="cmake-toolchain/LinuxUserMode-ARM64.cmake"
# Set CMake variables that subsequent CMake scripts can check against
set(CMAKE_SYSTEM_NAME Linux)
@ -8,13 +8,14 @@ set(CMAKE_SYSTEM_PROCESSOR ARM64)
set(TARGET_TRIPLE aarch64-linux-gnu)
# Currently only use clang as it makes cross-compilation easier
set(CMAKE_ASM_COMPILER_TARGET ${TARGET_TRIPLE})
set(CMAKE_C_COMPILER clang)
set(CMAKE_C_COMPILER_TARGET ${TARGET_TRIPLE})
set(CMAKE_CXX_COMPILER clang++)
set(CMAKE_CXX_COMPILER_TARGET ${TARGET_TRIPLE})
# Point clang sysroot to cross compilation toolchain when cross compiling
if(NOT CMAKE_HOST_SYSTEM_PROCESSOR EQUAL CMAKE_SYSTEM_PROCESSOR)
if(NOT CMAKE_HOST_SYSTEM_PROCESSOR MATCHES ARM64|aarch64)
# C/C++ toolchain (installed on Ubuntu using apt-get gcc-aarch64-linux-gnu g++-aarch64-linux-gnu)
set(CMAKE_SYSROOT_COMPILE /usr/${TARGET_TRIPLE})

Просмотреть файл

@ -1,5 +1,5 @@
# This toolchain file configures CMake options for Windows User Mode AMD64 compilation with CPU optimizations.
# To use the toolchain file, run cmake .. -DCMAKE_TOOLCHAIN_FILE=cmake-toolchain/WindowsUserMode-AMD64.cmake
# To use the toolchain file, run cmake .. -DCMAKE_TOOLCHAIN_FILE="cmake-toolchain/WindowsUserMode-AMD64.cmake"
# Set CMake variables that subsequent CMake scripts can check against
set(CMAKE_SYSTEM_NAME Windows)

Просмотреть файл

@ -1,5 +1,5 @@
# This toolchain file configures CMake options for Windows User Mode x86 compilation with CPU optimizations.
# To use the toolchain file, run cmake .. -DCMAKE_TOOLCHAIN_FILE=cmake-toolchain/WindowsUserMode-X86.cmake -A Win32
# To use the toolchain file, run cmake .. -DCMAKE_TOOLCHAIN_FILE="cmake-toolchain/WindowsUserMode-X86.cmake" -A Win32
#
# (The "-A Win32" option seems to be required when compiling on a 64-bit host. Ideally this toolchain file
# should set all the required options, but I haven't figured out how to force 32-bit compilation from the

Просмотреть файл

@ -105,7 +105,7 @@ function(process_cppasm filepath outformat archdefine)
if((NOT outformat STREQUAL gas) AND (NOT outformat STREQUAL masm))
message(FATAL_ERROR "cppasm processing invoked with unrecognized outformat (${outformat})")
endif()
if((NOT archdefine STREQUAL amd64) AND (NOT archdefine STREQUAL x86))
if((NOT archdefine STREQUAL amd64) AND (NOT archdefine STREQUAL x86) AND (NOT archdefine STREQUAL arm64))
message(FATAL_ERROR "cppasm processing invoked with unrecognized archdefine (${archdefine})")
endif()
get_filename_component(rootpath ${filepath} DIRECTORY)
@ -151,14 +151,20 @@ function(process_cppasm filepath outformat archdefine)
endif()
endfunction()
function(process_symcryptasm filepath outformat archdefine)
function(process_symcryptasm filepath outformat archdefine callingconvention)
get_filename_component(fileextension ${filepath} EXT)
if(NOT fileextension STREQUAL .symcryptasm)
message(FATAL_ERROR "symcryptasm processing invoked on file with incorrect extension (${filepath} -> ${fileextension})")
endif()
if((NOT outformat STREQUAL gas) AND (NOT outformat STREQUAL masm))
if((NOT outformat STREQUAL gas) AND (NOT outformat STREQUAL masm) AND (NOT outformat STREQUAL armasm64))
message(FATAL_ERROR "symcryptasm processing invoked with unrecognized outformat (${outformat})")
endif()
if((NOT archdefine STREQUAL amd64) AND (NOT archdefine STREQUAL x86) AND (NOT archdefine STREQUAL arm64))
message(FATAL_ERROR "symcryptasm processing invoked with unrecognized archdefine (${archdefine})")
endif()
if((NOT callingconvention STREQUAL msft) AND (NOT callingconvention STREQUAL systemv) AND (NOT callingconvention STREQUAL aapcs64) AND (NOT callingconvention STREQUAL arm64ec))
message(FATAL_ERROR "symcryptasm processing invoked with unrecognized callingconvention (${callingconvention})")
endif()
get_filename_component(rootpath ${filepath} DIRECTORY)
get_filename_component(filestem ${filepath} NAME_WE) # filestem is the filename w/out extension
set(filepath ${CMAKE_CURRENT_SOURCE_DIR}/${filepath})
@ -168,7 +174,7 @@ function(process_symcryptasm filepath outformat archdefine)
add_custom_command(
OUTPUT ${output_cppasm}
COMMAND ${CMAKE_COMMAND} -E make_directory ${output_directory}
COMMAND python3 ${CMAKE_SOURCE_DIR}/scripts/symcryptasm_processor.py ${outformat} ${filepath} ${output_cppasm}
COMMAND python3 ${CMAKE_SOURCE_DIR}/scripts/symcryptasm_processor.py ${outformat} ${archdefine} ${callingconvention} ${filepath} ${output_cppasm}
MAIN_DEPENDENCY ${filepath}
DEPENDS ${CMAKE_SOURCE_DIR}/scripts/symcryptasm_processor.py
COMMENT "Python preprocessing ${filepath} to ${outformat} (${output_cppasm})"
@ -183,19 +189,15 @@ else()
if(CMAKE_SYSTEM_PROCESSOR MATCHES "AMD64")
list(APPEND SOURCES_COMMON linux/intrinsics.c)
endif()
if(CMAKE_SYSTEM_PROCESSOR MATCHES "ARM64")
list(APPEND SOURCES_COMMON linux/asmstubs.c)
endif()
endif()
if(WIN32 AND NOT(SYMCRYPT_TARGET_ENV MATCHES "Generic"))
if(CMAKE_SYSTEM_PROCESSOR MATCHES "AMD64")
process_symcryptasm(amd64/aesasm.symcryptasm masm amd64)
process_symcryptasm(amd64/fdef_asm.symcryptasm masm amd64)
process_symcryptasm(amd64/fdef369_asm.symcryptasm masm amd64)
process_symcryptasm(amd64/fdef_mulx.symcryptasm masm amd64)
process_symcryptasm(amd64/wipe.symcryptasm masm amd64)
process_symcryptasm(amd64/aesasm.symcryptasm masm amd64 msft)
process_symcryptasm(amd64/fdef_asm.symcryptasm masm amd64 msft)
process_symcryptasm(amd64/fdef369_asm.symcryptasm masm amd64 msft)
process_symcryptasm(amd64/fdef_mulx.symcryptasm masm amd64 msft)
process_symcryptasm(amd64/wipe.symcryptasm masm amd64 msft)
list(APPEND SOURCES_COMMON
amd64/aesasm-masm.asm
@ -229,11 +231,11 @@ if(WIN32 AND NOT(SYMCRYPT_TARGET_ENV MATCHES "Generic"))
endif()
elseif(NOT(SYMCRYPT_TARGET_ENV MATCHES "Generic"))
if(CMAKE_SYSTEM_PROCESSOR MATCHES "AMD64")
process_symcryptasm(amd64/aesasm.symcryptasm gas amd64)
process_symcryptasm(amd64/fdef_asm.symcryptasm gas amd64)
process_symcryptasm(amd64/fdef369_asm.symcryptasm gas amd64)
process_symcryptasm(amd64/fdef_mulx.symcryptasm gas amd64)
process_symcryptasm(amd64/wipe.symcryptasm gas amd64)
process_symcryptasm(amd64/aesasm.symcryptasm gas amd64 systemv)
process_symcryptasm(amd64/fdef_asm.symcryptasm gas amd64 systemv)
process_symcryptasm(amd64/fdef369_asm.symcryptasm gas amd64 systemv)
process_symcryptasm(amd64/fdef_mulx.symcryptasm gas amd64 systemv)
process_symcryptasm(amd64/wipe.symcryptasm gas amd64 systemv)
list(APPEND SOURCES_COMMON
amd64/aesasm-gas.asm
@ -248,6 +250,20 @@ elseif(NOT(SYMCRYPT_TARGET_ENV MATCHES "Generic"))
amd64/fdef_mulx-gas.asm
amd64/wipe-gas.asm
PROPERTY LANGUAGE ASM)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ARM64")
process_symcryptasm(arm64/fdef_asm.symcryptasm gas arm64 aapcs64)
process_symcryptasm(arm64/fdef369_asm.symcryptasm gas arm64 aapcs64)
process_symcryptasm(arm64/wipe.symcryptasm gas arm64 aapcs64)
list(APPEND SOURCES_COMMON
arm64/fdef_asm-gas.asm
arm64/fdef369_asm-gas.asm
arm64/wipe-gas.asm)
set_source_files_properties(
arm64/fdef_asm-gas.asm
arm64/fdef369_asm-gas.asm
arm64/wipe-gas.asm
PROPERTY LANGUAGE ASM)
endif()
endif()

Просмотреть файл

@ -31,7 +31,7 @@ const SYMCRYPT_MODULAR_FUNCTIONS g_SymCryptModFns[] = {
SYMCRYPT_MOD_FUNCTIONS_FDEF_MONTGOMERY1024, // Special faster code for 1024-bit Montgomery moduli
SYMCRYPT_MOD_FUNCTIONS_FDEF_MONTGOMERY_MULX1024, // Special faster code for 1024-bit Montgomery moduli, MULX-based code
#elif SYMCRYPT_CPU_ARM64 && SYMCRYPT_MS_VC
#elif SYMCRYPT_CPU_ARM64
SYMCRYPT_MOD_FUNCTIONS_FDEF369_MONTGOMERY,
{NULL,},
@ -68,7 +68,7 @@ const SYMCRYPT_MODULUS_TYPE_SELECTION_ENTRY SymCryptModulusTypeSelections[] =
{('xM' << 16) + SymCryptModFntableMontgomeryMulx, SYMCRYPT_CPU_FEATURES_FOR_MULX, 0, SYMCRYPT_MODULUS_FEATURE_MONTGOMERY },
{('1M' << 16) + SymCryptModFntableMontgomery1024, 0, 1024, SYMCRYPT_MODULUS_FEATURE_MONTGOMERY },
#elif SYMCRYPT_CPU_ARM64 && SYMCRYPT_MS_VC
#elif SYMCRYPT_CPU_ARM64
{('mM' << 16) + SymCryptModFntableMontgomery, 0, 256, SYMCRYPT_MODULUS_FEATURE_MONTGOMERY },
{('9M' << 16) + SymCryptModFntable369Montgomery, 0, 384, SYMCRYPT_MODULUS_FEATURE_MONTGOMERY },

Просмотреть файл

@ -1,472 +0,0 @@
;
; fdef_369asm.asm Assembler code for large integer arithmetic in the default data format
;
; This file contains alternative routines that pretend that each digit is only 3 words.
; This gets used if the number is 1, 2, 3, 5, 6, or 9 digits long.
; The immediate advantage is that it improves EC performance on 192, 384, and 521-bit curves.
;
; Most of this code is a direct copy of the default code.
;
; Copyright (c) Microsoft Corporation. Licensed under the MIT license.
;
#include "ksarm64.h"
; As Arm assembler already uses C preprocessor, we can just hardcode this asm to include constants
; MASM for now. To be fixed properly when converting arm64 asm to symcryptasm.
#define SYMCRYPT_MASM
#include "C_asm_shared.inc"
#undef SYMCRYPT_MASM
#include "symcrypt_version.inc"
#include "symcrypt_name_mangling.inc"
#include "symcrypt_magic.inc"
; A digit consists of 3 words of 64 bits each
;UINT32
;SYMCRYPT_CALL
; SymCryptFdef369RawAdd(
; _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc1,
; _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc2,
; _Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 pDst,
; UINT32 nDigits );
;
; Initial inputs to registers:
; pSrc1 -> x0
; pSrc2 -> x1
; pDst -> x2
; nDigits -> x3
; Adds two nDigits-digit numbers (3 words of 64 bits per digit) word by word with a single
; carry chain, writes the sum to pDst and returns the final carry in x0.
; NOTE(review): the loop body runs before the counter is tested, so nDigits == 0 is not
; handled here - presumably callers guarantee nDigits >= 1; confirm.
LEAF_ENTRY ARM64EC_NAME_MANGLE(SymCryptFdef369RawAddAsm)
neg x3, x3 ; negate the digit count
ands x4, x4, x4 ; Zero the carry flag
; (only the flag effect of the ands matters; x4 is overwritten by the first load below)
SymCryptFdef369RawAddAsmLoop
add x3, x3, #1 ; Increment the digit count by one
; carry is in the carry flag
; (add, ldp/ldr, stp/str and cbnz do not write the flags, so the carry survives between digits)
ldp x4, x6, [x0], #16 ; Load two words of pSrc1
ldp x5, x7, [x1], #16 ; Load two words of pSrc2
adcs x4, x4, x5
adcs x6, x6, x7
stp x4, x6, [x2], #16 ; Store the result in the destination
ldr x4, [x0], #8 ; Load the third word of the digit from pSrc1
ldr x5, [x1], #8 ; Load the third word of the digit from pSrc2
adcs x4, x4, x5
str x4, [x2], #8 ; Store the third word of the result
cbnz x3, SymCryptFdef369RawAddAsmLoop ; Loop until the negated count has counted up to zero
csetcs x0 ; Set the return value equal to the carry
ret
LEAF_END ARM64EC_NAME_MANGLE(SymCryptFdef369RawAddAsm)
;UINT32
;SYMCRYPT_CALL
;SymCryptFdef369RawSub(
; _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 Src1,
; _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 Src2,
; _Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 Dst,
; UINT32 nDigits )
;
; Initial inputs to registers:
; pSrc1 -> x0
; pSrc2 -> x1
; pDst -> x2
; nDigits -> x3
; Subtracts pSrc2 from pSrc1 (3 words of 64 bits per digit) word by word with a single
; borrow chain, writes the difference to pDst and returns 1 in x0 if the final subtraction
; borrowed, 0 otherwise.
; NOTE(review): the loop body runs before the counter is tested, so nDigits == 0 is not
; handled here - presumably callers guarantee nDigits >= 1; confirm.
LEAF_ENTRY ARM64EC_NAME_MANGLE(SymCryptFdef369RawSubAsm)
neg x3, x3 ; negate the digit count
subs x4, x4, x4 ; Set the carry flag (i.e. no borrow)
; (only the flag effect of the subs matters; x4 is overwritten by the first load below)
SymCryptFdef369RawSubAsmLoop
add x3, x3, #1 ; Increment the digit count by one
; borrow is in the carry flag (flipped)
; (add, ldp/ldr, stp/str and cbnz do not write the flags, so the borrow survives between digits)
ldp x4, x6, [x0], #16 ; Load two words of pSrc1
ldp x5, x7, [x1], #16 ; Load two words of pSrc2
sbcs x4, x4, x5
sbcs x6, x6, x7
stp x4, x6, [x2], #16 ; Store the result in the destination
ldr x4, [x0], #8 ; Load the third word of the digit from pSrc1
ldr x5, [x1], #8 ; Load the third word of the digit from pSrc2
sbcs x4, x4, x5
str x4, [x2], #8 ; Store the third word of the result
cbnz x3, SymCryptFdef369RawSubAsmLoop ; Loop until the negated count has counted up to zero
csetcc x0 ; If the carry is clear (borrow), set the return value to 1
ret
LEAF_END ARM64EC_NAME_MANGLE(SymCryptFdef369RawSubAsm)
;VOID
;SYMCRYPT_CALL
;SymCryptFdef369MaskedCopy(
; _In_reads_bytes_( nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCBYTE pbSrc,
; _InOut_writes_bytes_( nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PBYTE pbDst,
; UINT32 nDigits,
; UINT32 mask )
; Conditionally copies nDigits digits (3 words of 64 bits each) from the source (x0) to the
; destination (x1). The mask (x3) is turned into the carry flag once, up front; every word
; is then loaded from both buffers and the selected one is stored back, so the same loads
; and stores are executed whether or not the copy takes place.
; NOTE(review): the flag is derived from all 64 bits of x3 although mask is declared UINT32;
; this presumably relies on the upper 32 bits being zero - confirm against the callers.
LEAF_ENTRY ARM64EC_NAME_MANGLE(SymCryptFdef369MaskedCopyAsm)
neg x2, x2 ; negate the digit count
subs x4, XZR, x3 ; If (x3 > 0) clear the carry flag (i.e. borrow)
; (the difference written to x4 is not used; x4 is overwritten by the first load below)
SymCryptFdef369MaskedCopyAsmLoop
add x2, x2, #1 ; Increment the digit count by one
; (nothing in this loop writes the flags, so the condition computed above stays valid)
ldp x4, x6, [x0], #16 ; Load two words of the source
ldp x5, x7, [x1] ; Load two words of the destination
cselcc x4, x4, x5 ; If the carry is clear, select the source operands
cselcc x6, x6, x7
stp x4, x6, [x1], #16 ; Store the two words in the destination
ldr x4, [x0], #8 ; Load the third word of the digit from the source
ldr x5, [x1] ; Load the third word of the digit from the destination
cselcc x4, x4, x5
str x4, [x1], #8 ; Store the selected third word in the destination
cbnz x2, SymCryptFdef369MaskedCopyAsmLoop
; Done, no return value
ret
LEAF_END ARM64EC_NAME_MANGLE(SymCryptFdef369MaskedCopyAsm)
;VOID
;SYMCRYPT_CALL
;SymCryptFdef369RawMul(
; _In_reads_(nWords1) PCUINT32 pSrc1,
; UINT32 nDigits1,
; _In_reads_(nWords2) PCUINT32 pSrc2,
; UINT32 nDigits2,
; _Out_writes_(nWords1 + nWords2) PUINT32 pDst )
;
; Initial inputs to registers:
; pSrc1 -> x0
; nDigits1 -> x1
; pSrc2 -> x2
; nDigits2 -> x3
; pDst -> x4
;
; Basic structure:
; for each word in Src1:
; Dst += Src2 * word
;
; Register assignments
; x0 = pSrc1 (moving forward one word every outer loop)
; x1 = negated word count of pSrc1
; x2 = pSrc2 (moving forward one *digit* every inner loop)
; x3 = negated digit count of pSrc2 and pDst
; x4 = pDst (moving forward one *digit* every inner loop)
; x5 = Stored pDst (moving forward one word every outer loop)
; x6 = Current word loaded from pSrc1
; x8, x9 = Current words loaded in pairs from pSrc2
; x10, x11 = Current words loaded in pairs from pDst
; x12, x15 = "128-bit" sliding register to hold the result of multiplies
; x16 = Stored pSrc2
; x17 = Stored negated digit count of pSrc2
; Note x13, x14 are reserved in ARM64EC and thus are not used
; Multiplies pSrc1 by pSrc2 one word of pSrc1 at a time. The first pass writes
; pSrc2 * pSrc1[0] straight to pDst; each later pass adds pSrc2 * pSrc1[i] into pDst
; starting one word further up.
LEAF_ENTRY ARM64EC_NAME_MANGLE(SymCryptFdef369RawMulAsm)
add x1, x1, x1, LSL #1 ; Calculate word count (x1 * 3)
neg x1, x1 ; negate nWords1
neg x3, x3 ; negate nDigits2
mov x5, x4 ; store pDst
mov x16, x2 ; store pSrc2
mov x17, x3 ; store -nDigits2 for later
;
; First iteration of main loop (no adding of previous values from pDst)
;
ands x15, x15, XZR ; Clearing the carry flag and setting x15 = 0
ldr x6, [x0] ; load the first word from pSrc1
SymCryptFdef369RawMulAsmLoopInner1
add x3, x3, #1 ; move one digit up
ldp x8, x9, [x2], #16 ; load 2 words from pSrc2
mul x12, x6, x8 ; Bits <63:0> of pSrc1[0]*pSrc2[j]
adcs x12, x12, x15 ; Adding the previous word (if there was a carry from the last addition it is added)
umulh x15, x6, x8 ; Bits <127:64> of pSrc1[0]*pSrc2[j]
str x12, [x4], #8 ; Store to destination
mul x12, x6, x9 ; Bits <63:0> of pSrc1[0]*pSrc2[j+1]
adcs x12, x12, x15 ; Adding the previous word (if there was a carry from the last addition it is added)
umulh x15, x6, x9 ; Bits <127:64> of pSrc1[0]*pSrc2[j+1]
str x12, [x4], #8 ; Store to destination
ldr x8, [x2], #8 ; load the third word of the digit from pSrc2
mul x12, x6, x8 ; Bits <63:0> of pSrc1[0]*pSrc2[j+2]
adcs x12, x12, x15 ; Adding the previous word (if there was a carry from the last addition it is added)
umulh x15, x6, x8 ; Bits <127:64> of pSrc1[0]*pSrc2[j+2]
str x12, [x4], #8 ; Store to destination
cbnz x3, SymCryptFdef369RawMulAsmLoopInner1
adc x15, x15, XZR ; Store the next word into the destination (with the carry if any)
str x15, [x4]
add x1, x1, #1 ; move one word up
add x0, x0, #8 ; move start of pSrc1 one word up
add x5, x5, #8 ; move start of pDst one word up
; Execution falls through into the outer loop without testing x1: the word count is
; 3 * nDigits1, so (assuming nDigits1 >= 1) at least two words of pSrc1 remain.
;
; MAIN LOOP
;
SymCryptFdef369RawMulAsmLoopOuter
mov x3, x17 ; set -nDigits2
mov x2, x16 ; set pSrc2
mov x4, x5 ; set pDst
ands x15, x15, XZR ; Clearing the carry flag and setting x15 = 0
ldr x6, [x0] ; load the next word from pSrc1
SymCryptFdef369RawMulAsmLoopInner
add x3, x3, #1 ; move one digit up
ldp x8, x9, [x2], #16 ; load 2 words from pSrc2
ldp x10, x11, [x4] ; load 2 words from pDst
mul x12, x6, x8 ; Bits <63:0> of pSrc1[i]*pSrc2[j]
adcs x12, x12, x15 ; Adding the previous word (if there was a carry from the last addition it is added)
umulh x15, x6, x8 ; Bits <127:64> of pSrc1[i]*pSrc2[j]
adc x15, x15, XZR ; Add the carry if any and don't update the flags
; Note: this cannot overflow as the maximum for <x15:x12> is (2^64-1)(2^64-1)+(2^64-1)+1 = 2^128 - 2^64 + 1
adds x12, x12, x10 ; add the word from the destination and update the flags (this can overflow)
; (the carry out of this adds is consumed by the adcs of the next word)
str x12, [x4], #8 ; Store to destination
mul x12, x6, x9 ; Bits <63:0> of pSrc1[i]*pSrc2[j+1]
adcs x12, x12, x15 ; Adding the previous word (if there was a carry from the last addition it is added)
umulh x15, x6, x9 ; Bits <127:64> of pSrc1[i]*pSrc2[j+1]
adc x15, x15, XZR ; Add the carry if any and don't update the flags
adds x12, x12, x11 ; add the word from the destination and update the flags (this can overflow)
str x12, [x4], #8 ; Store to destination
ldr x8, [x2], #8 ; load the third word of the digit from pSrc2
ldr x10, [x4] ; load the third word of the digit from pDst
mul x12, x6, x8 ; Bits <63:0> of pSrc1[i]*pSrc2[j+2]
adcs x12, x12, x15 ; Adding the previous word (if there was a carry from the last addition it is added)
umulh x15, x6, x8 ; Bits <127:64> of pSrc1[i]*pSrc2[j+2]
adc x15, x15, XZR ; Add the carry if any and don't update the flags
adds x12, x12, x10 ; add the word from the destination and update the flags (this can overflow)
str x12, [x4], #8 ; Store to destination
cbnz x3, SymCryptFdef369RawMulAsmLoopInner
adc x15, x15, XZR ; Store the next word into the destination (with the carry if any)
str x15, [x4]
adds x1, x1, #1 ; move one word up
; (the two adds below do not write the flags, so the bne still tests the result of the adds above)
add x0, x0, #8 ; move start of pSrc1 one word up
add x5, x5, #8 ; move start of pDst one word up
bne SymCryptFdef369RawMulAsmLoopOuter
; Done, no return value
ret
LEAF_END ARM64EC_NAME_MANGLE(SymCryptFdef369RawMulAsm)
;VOID
;SymCryptFdef369MontgomeryReduceAsm(
; _In_ PCSYMCRYPT_MODULUS pmMod,
; _In_ PUINT32 pSrc,
; _Out_ PUINT32 pDst )
;
; Initial inputs to registers:
; pmMod -> x0
; pSrc -> x1
; pDst -> x2
;
; Register assignments
; x0 = pMod (moving forward one *digit* every inner loop)
; x1 = pSrc (moving forward one *digit* every inner loop)
; x2 = pDst (used only in the end for subtract / result)
; x3 = negated digit count of pSrc and pMod
; x4 = negated word count of pSrc
; x5 = Inv64 of the modulus
; x6 = m = pSrc[i]*Inv64
; x7 = hc = high carry variable
; x8, x9 = Current words loaded in pairs from pSrc
; x10, x11 = Current words loaded in pairs from pMod
; x12, x15 = c variable = "128-bit" sliding register to hold the result of multiplies
; x16 = Temporary intermediate result
; x17 = Stored negated digit count of pSrc
; x19 = Stored pMod pointer
; x20 = Stored pSrc pointer (moving forward one word every outer loop)
; Note x13, x14 are reserved in ARM64EC and thus are not used
; Montgomery reduction in three phases:
;   1. Outer/inner loops: for every word of pSrc, add m * pMod (m = pSrc[i] * Inv64) into
;      pSrc in place, tracking the overflow above the top word in hc (x7).
;   2. Subtraction: pDst = (upper part of pSrc) - pMod, recording the borrow d.
;   3. Masked copy: overwrite pDst with the unsubtracted upper part of pSrc when the
;      subtraction borrowed and hc is zero.
; pSrc is modified in place. x19/x20 are saved in the prolog and restored in the epilog.
NESTED_ENTRY ARM64EC_NAME_MANGLE(SymCryptFdef369MontgomeryReduceAsm)
PROLOG_SAVE_REG_PAIR fp, lr, #-32! ; allocate 32 bytes of stack; store FP/LR
PROLOG_SAVE_REG_PAIR x19, x20, #16 ; free up x19/x20
ldr w3, [x0, #SymCryptModulusNdigitsOffsetArm64] ; # of Digits
ldr x5, [x0, #SymCryptModulusMontgomeryInv64OffsetArm64] ; Inv64 of modulus
add x0, x0, #SymCryptModulusValueOffsetArm64 ; pMod
add x4, x3, x3, LSL #1 ; Calculate word count (x3 * 3)
neg x3, x3 ; Negate the digit count
neg x4, x4 ; Negate the word count
mov x17, x3 ; Store the digit count for later
mov x19, x0 ; Store the pMod pointer
mov x20, x1 ; Store the pSrc pointer
ands x7, x7, XZR ; Set hc to 0
;
; Main loop
;
SymCryptFdef369MontgomeryReduceAsmOuter
ldr x8, [x1] ; Load 1 word from pSrc
mul x6, x8, x5 ; <63:0> bits of pSrc[i]*Inv64 = m
ands x12, x12, XZR ; Set c to 0
ands x15, x15, XZR ; Set c to 0
SymCryptFdef369MontgomeryReduceAsmInner
ldp x10, x11, [x0], #16 ; pMod[j]
ldp x8, x9, [x1] ; pSrc[j]
mul x16, x6, x10 ; <63:0> of pMod[j]*m
adds x16, x16, x8 ; Adding pSrc[j]
umulh x15, x6, x10 ; <127:64> of pMod[j]*m
adc x15, x15, XZR ; Add the carry if any (***)
adds x12, x12, x16 ; Add the lower bits of c
adc x15, x15, XZR ; Add the carry if any (***)
; ***: These cannot produce extra carry as the maximum is
; (2^64 - 1)*(2^64-1) + 2^64-1 + 2^64-1 = 2^128 - 1
str x12, [x1], #8 ; pSrc[j] = (UINT64) c
mov x12, x15 ; c >>= 64
mul x16, x6, x11 ; <63:0> of pMod[j]*m
adds x16, x16, x9 ; Adding pSrc[j]
umulh x15, x6, x11 ; <127:64> of pMod[j]*m
adc x15, x15, XZR ; Add the carry if any (***)
adds x12, x12, x16 ; Add the lower bits of c
adc x15, x15, XZR ; Add the carry if any (***)
str x12, [x1], #8 ; pSrc[j] = (UINT64) c
mov x12, x15 ; c >>= 64
ldr x10, [x0], #8 ; pMod[j]
ldr x8, [x1] ; pSrc[j]
mul x16, x6, x10 ; <63:0> of pMod[j]*m
adds x16, x16, x8 ; Adding pSrc[j]
umulh x15, x6, x10 ; <127:64> of pMod[j]*m
adc x15, x15, XZR ; Add the carry if any (***)
adds x12, x12, x16 ; Add the lower bits of c
adc x15, x15, XZR ; Add the carry if any (***)
str x12, [x1], #8 ; pSrc[j] = (UINT64) c
mov x12, x15 ; c >>= 64
adds x3, x3, #1 ; Move one digit up
bne SymCryptFdef369MontgomeryReduceAsmInner
ldr x8, [x1] ; pSrc[nWords]
adds x12, x12, x8 ; c + pSrc[nWords]
adc x15, XZR, XZR ; Add the carry if any
adds x12, x12, x7 ; c + pSrc[nWords] + hc
adc x7, x15, XZR ; Add the carry if any and store into hc
str x12, [x1] ; pSrc[nWords] = c
adds x4, x4, #1 ; Move one word up
; (the add/mov instructions below do not write the flags, so the bne still tests the adds above)
add x20, x20, #8 ; Move stored pSrc pointer one word up
mov x0, x19 ; Restore pMod pointer
mov x1, x20 ; Restore pSrc pointer
mov x3, x17 ; Restore the digit counter
bne SymCryptFdef369MontgomeryReduceAsmOuter
;
; Subtraction
;
; At this point x20 has been advanced by one word per outer iteration, so it points at
; the upper part of pSrc, which holds the value to be conditionally reduced.
mov x16, x2 ; Store pDst pointer
; Prepare the pointers for subtract
mov x0, x20 ; pSrc
mov x1, x19 ; pMod
mov x10, x7 ; x10 = hc
mov x3, x17 ; Restore the digit counter
subs x4, x4, x4 ; Set the carry flag (i.e. no borrow)
SymCryptFdef369MontgomeryReduceRawSubAsmLoop
add x3, x3, #1 ; Increment the digit count by one
; borrow is in the carry flag (flipped)
ldp x4, x6, [x0], #16 ; Load two words of pSrc1
ldp x5, x7, [x1], #16 ; Load two words of pSrc2
sbcs x4, x4, x5
sbcs x6, x6, x7
stp x4, x6, [x2], #16 ; Store the result in the destination
ldr x4, [x0], #8
ldr x5, [x1], #8
sbcs x4, x4, x5
str x4, [x2], #8
cbnz x3, SymCryptFdef369MontgomeryReduceRawSubAsmLoop
csetcc x0 ; If the carry is clear (borrow), set the return value to 1
orr x11, x10, x0 ; x11 = hc|d
; Prepare the pointers for masked copy
mov x0, x20 ; pSrc
mov x1, x16 ; pDst
mov x2, x17 ; Restore the digit counter
subs x4, x10, x11 ; If (x11 > x10) clear the carry flag (i.e. borrow)
; (assuming hc is 0 or 1, x11 > x10 only when hc == 0 and d == 1, i.e. the subtraction
; borrowed with no high carry to absorb it, so the unsubtracted value must be kept)
SymCryptFdef369MontgomeryReduceMaskedCopyAsmLoop
add x2, x2, #1 ; Increment the digit count by one
ldp x4, x6, [x0], #16 ; Load two words of the source
ldp x5, x7, [x1] ; Load two words of the destination
cselcc x4, x4, x5 ; If the carry is clear, select the source operands
cselcc x6, x6, x7
stp x4, x6, [x1], #16 ; Store the two words in the destination
ldr x4, [x0], #8
ldr x5, [x1]
cselcc x4, x4, x5
str x4, [x1], #8
cbnz x2, SymCryptFdef369MontgomeryReduceMaskedCopyAsmLoop
; Done, no return value
EPILOG_RESTORE_REG_PAIR x19, x20, #16
EPILOG_RESTORE_REG_PAIR fp, lr, #32!
EPILOG_RETURN
NESTED_END ARM64EC_NAME_MANGLE(SymCryptFdef369MontgomeryReduceAsm)
END

Просмотреть файл

@ -0,0 +1,465 @@
//
// fdef369_asm.symcryptasm Assembler code for large integer arithmetic in the default data format
// Expresses asm in a generic enough way to enable generation of ARMASM64 and GAS using the
// symcryptasm_processor.py script and C preprocessor
//
// This file contains alternative routines that pretend that each digit is only 3 words.
// This gets used if the number is 1, 2, 3, 5, 6, or 9 digits long.
// The immediate advantage is that it improves EC performance on 192, 384, and 521-bit curves.
//
// Most of this code is a direct copy of the default code.
//
// Copyright (c) Microsoft Corporation. Licensed under the MIT license.
//
#include "symcryptasm_shared.cppasm"
// A digit consists of 3 words of 64 bits each
//UINT32
//SYMCRYPT_CALL
//SymCryptFdef369RawAddAsm(
// _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc1,
// _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc2,
// _Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 pDst,
// UINT32 nDigits )
// Adds two nDigits-digit numbers (3 words of 64 bits per digit) with a single carry chain,
// writes the sum to the destination and returns the final carry in X_0.
// The first digit is peeled out of the loop so that its first addition can use adds, which
// starts a fresh carry chain without a separate instruction to clear the carry flag.
// NOTE(review): the trailing "4, 8" look like the argument count and the number of X_n
// registers used (X_0..X_7) - confirm against symcryptasm_processor.py.
// NOTE(review): the first digit is processed before the count is tested, so nDigits == 0
// is not handled here - presumably callers guarantee nDigits >= 1; confirm.
FUNCTION_START(ARM64EC_NAME_MANGLE(SymCryptFdef369RawAddAsm), 4, 8)
ldp X_4, X_6, [X_0] // Load two words of pSrc1
ldp X_5, X_7, [X_1] // Load two words of pSrc2
adds X_4, X_4, X_5 // First addition of the chain: sets the carry flag, ignores its old value
adcs X_6, X_6, X_7
stp X_4, X_6, [X_2] // Store the result in the destination
ldr X_4, [X_0, #16] // Load one word of pSrc1
sub X_3, X_3, #1 // Decrement the digit count by one
// (sub, unlike subs, does not write the flags, so the carry chain is not disturbed)
ldr X_5, [X_1, #16] // Load one word of pSrc2
adcs X_4, X_4, X_5
str X_4, [X_2, #16] // Store the result in the destination
cbz X_3, SymCryptFdef369RawAddAsmEnd // Single-digit input: skip the loop
LABEL(SymCryptFdef369RawAddAsmLoop)
// carry is in the carry flag
// only update pointers to srcs and destination once per loop to reduce uops and dependencies
// (the pre-indexed "[reg, #24]!" forms advance each pointer by one 24-byte digit before the access)
ldp X_4, X_6, [X_0, #24]! // Load two words of pSrc1
ldp X_5, X_7, [X_1, #24]! // Load two words of pSrc2
adcs X_4, X_4, X_5
adcs X_6, X_6, X_7
stp X_4, X_6, [X_2, #24]! // Store the result in the destination
ldr X_4, [X_0, #16] // Load one word of pSrc1
sub X_3, X_3, #1 // Decrement the digit count by one
ldr X_5, [X_1, #16] // Load one word of pSrc2
adcs X_4, X_4, X_5
str X_4, [X_2, #16] // Store the result in the destination
cbnz X_3, SymCryptFdef369RawAddAsmLoop
ALIGN(4)
LABEL(SymCryptFdef369RawAddAsmEnd)
cset X_0, cs // Set the return value equal to the carry
FUNCTION_END(ARM64EC_NAME_MANGLE(SymCryptFdef369RawAddAsm))
//UINT32
//SYMCRYPT_CALL
//SymCryptFdef369RawSubAsm(
// _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc1,
// _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc2,
// _Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 pDst,
// UINT32 nDigits )
// Subtracts pSrc2 from pSrc1 (3 words of 64 bits per digit) with a single borrow chain,
// writes the difference to the destination and returns 1 in X_0 if the final subtraction
// borrowed, 0 otherwise.
// The first digit is peeled out of the loop so that its first subtraction can use subs,
// which starts a fresh borrow chain without a separate instruction to set the carry flag.
// NOTE(review): the trailing "4, 8" look like the argument count and the number of X_n
// registers used (X_0..X_7) - confirm against symcryptasm_processor.py.
// NOTE(review): the first digit is processed before the count is tested, so nDigits == 0
// is not handled here - presumably callers guarantee nDigits >= 1; confirm.
FUNCTION_START(ARM64EC_NAME_MANGLE(SymCryptFdef369RawSubAsm), 4, 8)
ldp X_4, X_6, [X_0] // Load two words of pSrc1
ldp X_5, X_7, [X_1] // Load two words of pSrc2
subs X_4, X_4, X_5 // First subtraction of the chain: sets the carry flag, ignores its old value
sbcs X_6, X_6, X_7
stp X_4, X_6, [X_2] // Store the result in the destination
ldr X_4, [X_0, #16] // Load one word of pSrc1
sub X_3, X_3, #1 // Decrement the digit count by one
// (sub, unlike subs, does not write the flags, so the borrow chain is not disturbed)
ldr X_5, [X_1, #16] // Load one word of pSrc2
sbcs X_4, X_4, X_5
str X_4, [X_2, #16] // Store the result in the destination
cbz X_3, SymCryptFdef369RawSubAsmEnd // Single-digit input: skip the loop
LABEL(SymCryptFdef369RawSubAsmLoop)
// borrow is in the carry flag (flipped)
// only update pointers to srcs and destination once per loop to reduce uops and dependencies
// (the pre-indexed "[reg, #24]!" forms advance each pointer by one 24-byte digit before the access)
ldp X_4, X_6, [X_0, #24]! // Load two words of pSrc1
ldp X_5, X_7, [X_1, #24]! // Load two words of pSrc2
sbcs X_4, X_4, X_5
sbcs X_6, X_6, X_7
stp X_4, X_6, [X_2, #24]! // Store the result in the destination
ldr X_4, [X_0, #16] // Load one word of pSrc1
sub X_3, X_3, #1 // Decrement the digit count by one
ldr X_5, [X_1, #16] // Load one word of pSrc2
sbcs X_4, X_4, X_5
str X_4, [X_2, #16] // Store the result in the destination
cbnz X_3, SymCryptFdef369RawSubAsmLoop
ALIGN(4)
LABEL(SymCryptFdef369RawSubAsmEnd)
cset X_0, cc // If the carry is clear (borrow), set the return value to 1
FUNCTION_END(ARM64EC_NAME_MANGLE(SymCryptFdef369RawSubAsm))
//VOID
//SYMCRYPT_CALL
//SymCryptFdef369MaskedCopyAsm(
// _In_reads_bytes_( nDigits*SYMCRYPT_FDEF_DIGIT_SIZE ) PCBYTE pbSrc,
// _Inout_updates_bytes_( nDigits*SYMCRYPT_FDEF_DIGIT_SIZE ) PBYTE pbDst,
// UINT32 nDigits,
// UINT32 mask )
// Conditionally copies nDigits digits (3 words of 64 bits each) from the source (X_0) to
// the destination (X_1). The mask is converted into the carry flag once, up front; every
// word is then loaded from both buffers and the selected one is stored back, so the same
// loads and stores are executed whether or not the copy takes place.
// NOTE(review): the trailing "4, 7" look like the argument count and the number of X_n
// registers used (X_0..X_6) - confirm against symcryptasm_processor.py.
// NOTE(review): the flag is derived from the whole of X_3 although mask is declared UINT32;
// this presumably relies on the upper 32 bits being zero - confirm against the callers.
// NOTE(review): the first digit is processed before the count is tested, so nDigits == 0
// is not handled here - presumably callers guarantee nDigits >= 1; confirm.
FUNCTION_START(ARM64EC_NAME_MANGLE(SymCryptFdef369MaskedCopyAsm), 4, 7)
subs xzr, xzr, X_3 // If (X_3 > 0) clear the carry flag (i.e. borrow)
// The mask now lives only in the carry flag, which frees X_3 for use as a data register.
// Nothing below writes the flags, so the condition stays valid until the function returns.
ldp X_3, X_5, [X_0] // Load two words of the source
ldp X_4, X_6, [X_1] // Load two words of the destination
csel X_3, X_3, X_4, cc // If the carry is clear, select the source operand
csel X_5, X_5, X_6, cc
stp X_3, X_5, [X_1] // Store the two words in the destination
ldr X_3, [X_0, #16] // Load one word of the source
sub X_2, X_2, #1 // Decrement the digit count by one
ldr X_4, [X_1, #16] // Load one word of the destination
csel X_3, X_3, X_4, cc
str X_3, [X_1, #16] // Store the one word in the destination
cbz X_2, SymCryptFdef369MaskedCopyAsmEnd // Single-digit input: skip the loop
LABEL(SymCryptFdef369MaskedCopyAsmLoop)
// (the pre-indexed "[reg, #24]!" loads advance both pointers by one 24-byte digit)
ldp X_3, X_5, [X_0, #24]! // Load two words of the source
ldp X_4, X_6, [X_1, #24]! // Load two words of the destination
csel X_3, X_3, X_4, cc // If the carry is clear, select the source operand
csel X_5, X_5, X_6, cc
// X_1 was already advanced by the load above, so the store needs no offset
stp X_3, X_5, [X_1] // Store the two words in the destination
ldr X_3, [X_0, #16] // Load one word of the source
sub X_2, X_2, #1 // Decrement the digit count by one
ldr X_4, [X_1, #16] // Load one word of the destination
csel X_3, X_3, X_4, cc
str X_3, [X_1, #16] // Store the one word in the destination
cbnz X_2, SymCryptFdef369MaskedCopyAsmLoop
LABEL(SymCryptFdef369MaskedCopyAsmEnd)
// Done, no return value
FUNCTION_END(ARM64EC_NAME_MANGLE(SymCryptFdef369MaskedCopyAsm))
//VOID
//SYMCRYPT_CALL
//SymCryptFdef369RawMulAsm(
// _In_reads_(nDigits1*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc1,
// UINT32 nDigits1,
// _In_reads_(nDigits2*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc2,
// UINT32 nDigits2,
// _Out_writes_((nDigits1+nDigits2)*SYMCRYPT_FDEF_DIGIT_NUINT32) PUINT32 pDst )
//
// Multiplies pSrc1 by pSrc2 into pDst, consuming one 64-bit word of pSrc1 per outer pass.
// In this routine a digit is handled as 3 words of 64 bits (24 bytes): the word count is
// nDigits1 * 3 and the inner loops advance pSrc2 / pDst by 24 bytes per iteration.
//
// Basic structure:
//   for each word in Src1:
//     Dst += Src2 * word
// The first word of Src1 is handled by a dedicated loop that writes (rather than accumulates into)
// pDst, so pDst does not need to be zeroed beforehand.
//
// All loops test their counters at the bottom, so a zero digit count is not handled
// (the counter would wrap).
// NOTE(review): nDigits1 / nDigits2 are UINT32 but are consumed through the full 64-bit
// X_1 / X_3 registers; this presumably relies on the caller zero-extending them - confirm.
//
// Register assignments
// X_0 = pSrc1 (moving forward one word every outer loop)
// X_1 = word count of pSrc1
// X_2 = pSrc2 (moving forward one *digit* every inner loop)
// X_3 = digit count of pSrc2 and pDst
// X_4 = pDst (moving forward one *digit* every inner loop)
// X_5 = Stored pDst (moving forward one word every outer loop)
// X_6 = Current word loaded from pSrc1
// X_7, X_8 = Current words loaded in pairs from pSrc2
// X_9, X_10 = Current words loaded in pairs from pDst
// X_11, X_12 = Scratch registers for holding the results of multiplies
// X_13 = Stored pSrc2
// X_14 = Stored digit count of pSrc2
// X_15 = Scratch register for holding the results of multiplies
FUNCTION_START(ARM64EC_NAME_MANGLE(SymCryptFdef369RawMulAsm), 5, 16)
add X_1, X_1, X_1, LSL #1 // Calculate word count (X_1 * 3)
sub X_2, X_2, #24 // offset pSrc2 so we can use pre-increment form of loads
sub X_4, X_4, #24 // offset pDst so we can use pre-increment form of loads
mov X_5, X_4 // store pDst
mov X_13, X_2 // store pSrc2
mov X_14, X_3 // store nDigits2 for later
//
// First iteration of main loop (no adding of previous values from pDst)
//
ands X_12, X_12, xzr // Clearing the carry flag and setting X_12 = 0
ldr X_6, [X_0] // load the first word from pSrc1
LABEL(SymCryptFdef369RawMulAsmLoopInner1)
sub X_3, X_3, #1 // move one digit up
ldp X_7, X_8, [X_2, #24]! // load 2 words from pSrc2
mul X_11, X_6, X_7 // Bits <63:0> of pSrc1[0]*pSrc2[j]
adcs X_11, X_11, X_12 // Adding the previous high word (if there was a carry from the last addition it is added)
umulh X_12, X_6, X_7 // Bits <127:64> of pSrc1[0]*pSrc2[j]
mul X_15, X_6, X_8 // Bits <63:0> of pSrc1[0]*pSrc2[j+1]
adcs X_15, X_15, X_12 // Adding the previous high word (if there was a carry from the last addition it is added)
umulh X_12, X_6, X_8 // Bits <127:64> of pSrc1[0]*pSrc2[j+1]
stp X_11, X_15, [X_4, #24]! // Store to destination
ldr X_7, [X_2, #16] // load 1 word from pSrc2
mul X_11, X_6, X_7 // Bits <63:0> of pSrc1[0]*pSrc2[j+2]
adcs X_11, X_11, X_12 // Adding the previous high word (if there was a carry from the last addition it is added)
umulh X_12, X_6, X_7 // Bits <127:64> of pSrc1[0]*pSrc2[j+2]
str X_11, [X_4, #16] // Store to destination
cbnz X_3, SymCryptFdef369RawMulAsmLoopInner1 // cbnz does not touch the flags, so the carry survives into the next iteration
adc X_12, X_12, xzr // Fold the last carry into the final high word
str X_12, [X_4, #24] // Store it as the next word of the destination
sub X_1, X_1, #1 // move one word up
add X_0, X_0, #8 // move start of pSrc1 one word up
add X_5, X_5, #8 // move start of pDst one word up
//
// MAIN LOOP
//
LABEL(SymCryptFdef369RawMulAsmLoopOuter)
mov X_3, X_14 // set nDigits2
mov X_2, X_13 // set pSrc2
mov X_4, X_5 // set pDst
ands X_12, X_12, xzr // Clearing the carry flag and setting X_12 = 0
ldr X_6, [X_0] // load the next word from pSrc1
LABEL(SymCryptFdef369RawMulAsmLoopInner)
sub X_3, X_3, #1 // move one digit up
ldp X_7, X_8, [X_2, #24]! // load 2 words from pSrc2
ldp X_9, X_10, [X_4, #24]! // load 2 words from pDst
adcs X_9, X_9, X_12 // pDst[j] + high word of the previous product (+ carry from the last addition)
umulh X_11, X_6, X_7 // Bits <127:64> of pSrc1[i]*pSrc2[j]
adcs X_10, X_11, X_10 // pDst[j+1] + high word of pSrc1[i]*pSrc2[j] (+ carry from the last addition)
umulh X_12, X_6, X_8 // Bits <127:64> of pSrc1[i]*pSrc2[j+1]
adc X_12, X_12, xzr // Add the carry if any and don't update the flags
mul X_11, X_6, X_7 // Bits <63:0> of pSrc1[i]*pSrc2[j]
adds X_9, X_9, X_11 // add the low product bits into the first word and update the flags (this can overflow)
mul X_11, X_6, X_8 // Bits <63:0> of pSrc1[i]*pSrc2[j+1]
adcs X_10, X_10, X_11 // add the low product bits into the second word and update the flags (this can overflow)
stp X_9, X_10, [X_4] // Store to destination
ldr X_7, [X_2, #16] // load 1 word from pSrc2
ldr X_9, [X_4, #16] // load 1 word from pDst
adcs X_9, X_9, X_12 // pDst[j+2] + high word of the previous product (+ carry from the last addition)
umulh X_12, X_6, X_7 // Bits <127:64> of pSrc1[i]*pSrc2[j+2]
adc X_12, X_12, xzr // Add the carry if any and don't update the flags
mul X_11, X_6, X_7 // Bits <63:0> of pSrc1[i]*pSrc2[j+2]
adds X_9, X_9, X_11 // add the low product bits into the third word and update the flags (this can overflow)
str X_9, [X_4, #16] // Store to destination
cbnz X_3, SymCryptFdef369RawMulAsmLoopInner // the carry from the last adds is consumed by the first adcs of the next iteration
adc X_12, X_12, xzr // Fold the last carry into the final high word
str X_12, [X_4, #24] // Store it as the next word of the destination
subs X_1, X_1, #1 // move one word up (sets Z for the bne below)
add X_0, X_0, #8 // move start of pSrc1 one word up
add X_5, X_5, #8 // move start of pDst one word up
bne SymCryptFdef369RawMulAsmLoopOuter
// Done, no return value
FUNCTION_END(ARM64EC_NAME_MANGLE(SymCryptFdef369RawMulAsm))
//VOID
//SYMCRYPT_CALL
//SymCryptFdef369MontgomeryReduceAsm(
// _In_ PCSYMCRYPT_MODULUS pmMod,
// _Inout_ PUINT32 pSrc,
// _Out_ PUINT32 pDst )
//
// Word-by-word Montgomery reduction of pSrc modulo the value stored in pmMod.
// A digit is handled as 3 words of 64 bits (24 bytes) here. Three phases:
//   1. Main loop: for each of the nWords low words, m = pSrc[i]*Inv64 and pSrc[i..] += m * pMod,
//      with the overflow of each pass accumulated into the next word and into hc.
//      pSrc is modified in place.
//   2. Subtraction: pDst = (upper nWords words of pSrc) - pMod, d = borrow.
//   3. Masked copy: if the subtraction borrowed and hc == 0, the unsubtracted upper words of pSrc
//      are copied over pDst; the choice is made with csel, not with a branch.
//
// Register assignments
// X_0 = pMod (moving forward one *digit* every inner loop)
// X_1 = pSrc (moving forward one *digit* every inner loop)
// X_2 = pDst (used only in the end for subtract / result)
// X_3 = digit count of pSrc and pMod
// X_4 = word count of pSrc
// X_5 = Inv64 of the modulus
// X_6 = m = pSrc[i]*Inv64
// X_7 = hc = high carry variable
// X_8, X_9 = Current words loaded in pairs from pSrc
// X_10, X_11 = Current words loaded in pairs from pMod
// X_12, X_13 = c variable = "128-bit" register to hold the result of multiplies
// X_12 holds the low word and X_13 the high word; after each word is stored,
// c >>= 64 is done by moving X_13 into X_12
// X_14 = Temporary intermediate result
// X_15 = Stored digit count of pSrc
// X_16 = Stored pMod pointer
// X_17 = Stored pSrc pointer (moving forward one word every outer loop)
FUNCTION_START(ARM64EC_NAME_MANGLE(SymCryptFdef369MontgomeryReduceAsm), 3, 18)
ldr W_3, [X_0, #SymCryptModulusNdigitsOffsetArm64] // # of Digits
ldr X_5, [X_0, #SymCryptModulusMontgomeryInv64OffsetArm64] // Inv64 of modulus
add X_0, X_0, #SymCryptModulusValueOffsetArm64 // pMod
add X_4, X_3, X_3, LSL #1 // Calculate word count (X_3 * 3)
sub X_0, X_0, #24 // offset pMod so we can use pre-increment form of loads
sub X_1, X_1, #24 // offset pSrc so we can use pre-increment form of loads
sub X_2, X_2, #24 // offset pDst so we can use pre-increment form of loads
mov X_15, X_3 // Store the digit count for later
mov X_16, X_0 // Store the pMod pointer
mov X_17, X_1 // Store the pSrc pointer
and X_7, X_7, xzr // Set hc to 0
//
// Main loop
//
LABEL(SymCryptFdef369MontgomeryReduceAsmOuter)
ldr X_8, [X_1, #24] // Load 1 word from pSrc
mul X_6, X_8, X_5 // <63:0> bits of pSrc[i]*Inv64 = m
and X_12, X_12, xzr // Set c to 0
LABEL(SymCryptFdef369MontgomeryReduceAsmInner)
ldp X_10, X_11, [X_0, #24]! // pMod[j]
ldp X_8, X_9, [X_1, #24]! // pSrc[j]
mul X_14, X_6, X_10 // <63:0> of pMod[j]*m
adds X_14, X_14, X_8 // Adding pSrc[j]
umulh X_13, X_6, X_10 // <127:64> of pMod[j]*m
adc X_13, X_13, xzr // Add the carry if any (***)
adds X_12, X_12, X_14 // Add the lower bits of c
adc X_13, X_13, xzr // Add the carry if any (***)
// ***: These cannot produce extra carry as the maximum is
// (2^64 - 1)*(2^64-1) + 2^64-1 + 2^64-1 = 2^128 - 1
str X_12, [X_1] // pSrc[j] = (UINT64) c
mov X_12, X_13 // c >>= 64
mul X_14, X_6, X_11 // <63:0> of pMod[j+1]*m
adds X_14, X_14, X_9 // Adding pSrc[j+1]
umulh X_13, X_6, X_11 // <127:64> of pMod[j+1]*m
adc X_13, X_13, xzr // Add the carry if any (***)
adds X_12, X_12, X_14 // Add the lower bits of c
adc X_13, X_13, xzr // Add the carry if any (***)
str X_12, [X_1, #8] // pSrc[j+1] = (UINT64) c
mov X_12, X_13 // c >>= 64
ldr X_10, [X_0, #16] // pMod[j+2]
ldr X_8, [X_1, #16] // pSrc[j+2]
mul X_14, X_6, X_10 // <63:0> of pMod[j+2]*m
adds X_14, X_14, X_8 // Adding pSrc[j+2]
umulh X_13, X_6, X_10 // <127:64> of pMod[j+2]*m
adc X_13, X_13, xzr // Add the carry if any (***)
adds X_12, X_12, X_14 // Add the lower bits of c
adc X_13, X_13, xzr // Add the carry if any (***)
str X_12, [X_1, #16] // pSrc[j+2] = (UINT64) c
mov X_12, X_13 // c >>= 64
subs X_3, X_3, #1 // Move one digit up
bne SymCryptFdef369MontgomeryReduceAsmInner
ldr X_8, [X_1, #24] // pSrc[nWords]
adds X_12, X_12, X_8 // c + pSrc[nWords]
adc X_13, xzr, xzr // Capture the carry of that addition (0 or 1)
adds X_12, X_12, X_7 // c + pSrc[nWords] + hc
adc X_7, X_13, xzr // Add the carry if any and store into hc
str X_12, [X_1, #24] // pSrc[nWords] = c
subs X_4, X_4, #1 // Move one word up (the mov/add below leave the flags intact for the bne)
add X_17, X_17, #8 // Move stored pSrc pointer one word up
mov X_0, X_16 // Restore pMod pointer
mov X_1, X_17 // Restore pSrc pointer
mov X_3, X_15 // Restore the digit counter
bne SymCryptFdef369MontgomeryReduceAsmOuter
//
// Subtraction
//
// At this point X_17 has advanced nWords words, so it addresses the upper half of pSrc
// (still offset by -24 for the pre-increment loads).
mov X_14, X_2 // Store pDst pointer
// Prepare the pointers for subtract
mov X_0, X_17 // pSrc
mov X_1, X_16 // pMod
mov X_10, X_7 // X_10 = hc
mov X_3, X_15 // Restore the digit counter
subs X_4, X_4, X_4 // Set the carry flag (i.e. no borrow)
LABEL(SymCryptFdef369MontgomeryReduceRawSubAsmLoop)
sub X_3, X_3, #1 // Decrement the digit count by one
// borrow is in the carry flag (flipped)
ldp X_4, X_6, [X_0, #24]! // Load two words of pSrc1
ldp X_5, X_7, [X_1, #24]! // Load two words of pSrc2
sbcs X_4, X_4, X_5
sbcs X_6, X_6, X_7
stp X_4, X_6, [X_2, #24]! // Store the result in the destination
ldr X_4, [X_0, #16] // Load one word of pSrc1
ldr X_5, [X_1, #16] // Load one word of pSrc2
sbcs X_4, X_4, X_5
str X_4, [X_2, #16] // Store the result in the destination
cbnz X_3, SymCryptFdef369MontgomeryReduceRawSubAsmLoop
cset X_0, cc // d = 1 if the carry is clear (i.e. the subtraction borrowed), else 0
orr X_11, X_10, X_0 // X_11 = hc|d
// Prepare the pointers for masked copy
mov X_0, X_17 // pSrc
mov X_1, X_14 // pDst
mov X_2, X_15 // Restore the digit counter
subs X_4, X_10, X_11 // If (X_11 > X_10) clear the carry flag (i.e. borrow); happens only when hc == 0 and d == 1
LABEL(SymCryptFdef369MontgomeryReduceMaskedCopyAsmLoop)
sub X_2, X_2, #1 // decrement the digit count by one
ldp X_4, X_6, [X_0, #24]! // Load two words of the source
ldp X_5, X_7, [X_1, #24]! // Load two words of the destination
csel X_4, X_4, X_5, cc // If the carry is clear, select the source operands
csel X_6, X_6, X_7, cc
stp X_4, X_6, [X_1] // Store the two words in the destination
ldr X_4, [X_0, #16] // Load one word of the source
ldr X_5, [X_1, #16] // Load one word of the destination
csel X_4, X_4, X_5, cc // If the carry is clear, select the source operands
str X_4, [X_1, #16] // Store the one word in the destination
cbnz X_2, SymCryptFdef369MontgomeryReduceMaskedCopyAsmLoop
// Done, no return value
FUNCTION_END(ARM64EC_NAME_MANGLE(SymCryptFdef369MontgomeryReduceAsm))
FILE_END()

Просмотреть файл

@ -1,768 +0,0 @@
;
; fdef_asm.asm Assembler code for large integer arithmetic in the default data format for the arm64 architecture
;
; Copyright (c) Microsoft Corporation. Licensed under the MIT license.
;
#include "ksarm64.h"
; As Arm assembler already uses C preprocessor, we can just hardcode this asm to include constants
; MASM for now. To be fixed properly when converting arm64 asm to symcryptasm.
#define SYMCRYPT_MASM
#include "C_asm_shared.inc"
#undef SYMCRYPT_MASM
#include "symcrypt_version.inc"
#include "symcrypt_name_mangling.inc"
#include "symcrypt_magic.inc"
; A digit consists of 4 words of 64 bits each
;UINT32
;SYMCRYPT_CALL
; SymCryptFdefRawAddAsm(
; _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc1,
; _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc2,
; _Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 pDst,
; UINT32 nDigits );
;
; Multi-word addition: pDst = pSrc1 + pSrc2, one digit (4 words of 64 bits) per loop iteration.
; Returns the final carry (0 or 1) in x0.
; The loop tests its counter at the bottom, so nDigits == 0 is not handled.
; NOTE(review): nDigits is a UINT32 but the full 64-bit x3 is negated and tested;
; this presumably relies on the caller zero-extending it - confirm.
;
; Initial inputs to registers:
; pSrc1 -> x0
; pSrc2 -> x1
; pDst -> x2
; nDigits -> x3
LEAF_ENTRY ARM64EC_NAME_MANGLE(SymCryptFdefRawAddAsm)
neg x3, x3 ; negate the digit count (counts up towards zero)
ands x4, x4, x4 ; Zero the carry flag
SymCryptFdefRawAddAsmLoop
add x3, x3, #1 ; Increment the digit count by one (add does not touch the flags)
; carry is in the carry flag
ldp x4, x6, [x0], #16 ; Load two words of pSrc1
ldp x5, x7, [x1], #16 ; Load two words of pSrc2
adcs x4, x4, x5
adcs x6, x6, x7
stp x4, x6, [x2], #16 ; Store the result in the destination
ldp x4, x6, [x0], #16 ; Load two words of pSrc1
ldp x5, x7, [x1], #16 ; Load two words of pSrc2
adcs x4, x4, x5
adcs x6, x6, x7
stp x4, x6, [x2], #16 ; Store the result in the destination
cbnz x3, SymCryptFdefRawAddAsmLoop
csetcs x0 ; Set the return value equal to the carry
ret
LEAF_END ARM64EC_NAME_MANGLE(SymCryptFdefRawAddAsm)
;UINT32
;SYMCRYPT_CALL
;SymCryptFdefRawSubAsm(
; _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 Src1,
; _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 Src2,
; _Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 Dst,
; UINT32 nDigits )
;
; Multi-word subtraction: pDst = pSrc1 - pSrc2, one digit (4 words of 64 bits) per loop iteration.
; Returns 1 in x0 if the subtraction borrowed out of the top word, otherwise 0.
; The loop tests its counter at the bottom, so nDigits == 0 is not handled.
; NOTE(review): nDigits is a UINT32 but the full 64-bit x3 is negated and tested;
; this presumably relies on the caller zero-extending it - confirm.
;
; Initial inputs to registers:
; pSrc1 -> x0
; pSrc2 -> x1
; pDst -> x2
; nDigits -> x3
LEAF_ENTRY ARM64EC_NAME_MANGLE(SymCryptFdefRawSubAsm)
neg x3, x3 ; negate the digit count (counts up towards zero)
subs x4, x4, x4 ; Set the carry flag (i.e. no borrow)
SymCryptFdefRawSubAsmLoop
add x3, x3, #1 ; Increment the digit count by one (add does not touch the flags)
; borrow is in the carry flag (flipped)
ldp x4, x6, [x0], #16 ; Load two words of pSrc1
ldp x5, x7, [x1], #16 ; Load two words of pSrc2
sbcs x4, x4, x5
sbcs x6, x6, x7
stp x4, x6, [x2], #16 ; Store the result in the destination
ldp x4, x6, [x0], #16 ; Load two words of pSrc1
ldp x5, x7, [x1], #16 ; Load two words of pSrc2
sbcs x4, x4, x5
sbcs x6, x6, x7
stp x4, x6, [x2], #16 ; Store the result in the destination
cbnz x3, SymCryptFdefRawSubAsmLoop
csetcc x0 ; If the carry is clear (borrow), set the return value to 1
ret
LEAF_END ARM64EC_NAME_MANGLE(SymCryptFdefRawSubAsm)
;VOID
;SYMCRYPT_CALL
;SymCryptFdefMaskedCopyAsm(
; _In_reads_bytes_( nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCBYTE pbSrc,
; _InOut_writes_bytes_( nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PBYTE pbDst,
; UINT32 nDigits,
; UINT32 mask )
;
; Conditionally copies pbSrc over pbDst, one digit (4 words of 64 bits) per loop iteration.
; If x3 (mask) is non-zero every destination word is replaced by the source word; if it is zero
; every destination word is rewritten with its own value. The selection is done with csel on the
; carry flag, so the only branch is the one on the digit counter.
; The loop tests its counter at the bottom, so nDigits == 0 is not handled.
; NOTE(review): nDigits and mask are UINT32 but the full 64-bit x2 / x3 are used;
; this presumably relies on the caller zero-extending them - confirm.
;
; Initial inputs to registers:
; pbSrc -> x0
; pbDst -> x1
; nDigits -> x2
; mask -> x3
LEAF_ENTRY ARM64EC_NAME_MANGLE(SymCryptFdefMaskedCopyAsm)
neg x2, x2 ; negate the digit count (counts up towards zero)
subs x4, XZR, x3 ; If (x3 > 0) clear the carry flag (i.e. borrow)
SymCryptFdefMaskedCopyAsmLoop
add x2, x2, #1 ; Increment the digit count by one (add does not touch the flags)
ldp x4, x6, [x0], #16 ; Load two words of the source
ldp x5, x7, [x1] ; Load two words of the destination
cselcc x4, x4, x5 ; If the carry is clear, select the source operands
cselcc x6, x6, x7
stp x4, x6, [x1], #16 ; Store the two words in the destination
ldp x4, x6, [x0], #16 ; Load the next two words of the source
ldp x5, x7, [x1] ; Load the next two words of the destination
cselcc x4, x4, x5 ; If the carry is clear, select the source operands
cselcc x6, x6, x7
stp x4, x6, [x1], #16 ; Store the two words in the destination
cbnz x2, SymCryptFdefMaskedCopyAsmLoop
; Done, no return value
ret
LEAF_END ARM64EC_NAME_MANGLE(SymCryptFdefMaskedCopyAsm)
;VOID
;SYMCRYPT_CALL
;SymCryptFdefRawMulAsm(
; _In_reads_(nWords1) PCUINT32 pSrc1,
; UINT32 nDigits1,
; _In_reads_(nWords2) PCUINT32 pSrc2,
; UINT32 nDigits2,
; _Out_writes_(nWords1 + nWords2) PUINT32 pDst )
;
; Multiplies pSrc1 by pSrc2 into pDst, consuming one 64-bit word of pSrc1 per outer pass.
; A digit is 4 words of 64 bits (32 bytes): the word count is nDigits1 * 4 and the inner
; loops advance pSrc2 / pDst by 32 bytes per iteration.
; The first word of pSrc1 is handled by a dedicated loop that writes (rather than accumulates
; into) pDst, so pDst does not need to be zeroed beforehand.
; All loops test their counters at the bottom, so a zero digit count is not handled.
; NOTE(review): nDigits1 / nDigits2 are UINT32 but the full 64-bit x1 / x3 are used;
; this presumably relies on the caller zero-extending them - confirm.
;
; Initial inputs to registers:
; pSrc1 -> x0
; nDigits1 -> x1
; pSrc2 -> x2
; nDigits2 -> x3
; pDst -> x4
;
; Basic structure:
;   for each word in Src1:
;     Dst += Src2 * word
;
; Register assignments
; x0 = pSrc1 (moving forward one word every outer loop)
; x1 = negated word count of pSrc1
; x2 = pSrc2 (moving forward one *digit* every inner loop)
; x3 = negated digit count of pSrc2 and pDst
; x4 = pDst (moving forward one *digit* every inner loop)
; x5 = Stored pDst (moving forward one word every outer loop)
; x6 = Current word loaded from pSrc1
; x8, x9 = Current words loaded in pairs from pSrc2
; x10, x11 = Current words loaded in pairs from pDst
; x12, x15 = "128-bit" sliding register to hold the result of multiplies
; x16 = Stored pSrc2
; x17 = Stored negated digit count of pSrc2
; Note x13, x14 are reserved in ARM64EC and thus are not used
LEAF_ENTRY ARM64EC_NAME_MANGLE(SymCryptFdefRawMulAsm)
lsl x1, x1, #2 ; Calculate word count
neg x1, x1 ; negate nWords1
neg x3, x3 ; negate nDigits2
mov x5, x4 ; store pDst
mov x16, x2 ; store pSrc2
mov x17, x3 ; store -nDigits2 for later
;
; First iteration of main loop (no adding of previous values from pDst)
;
ands x15, x15, XZR ; Clearing the carry flag and setting x15 = 0
ldr x6, [x0] ; load the first word from pSrc1
SymCryptFdefRawMulAsmLoopInner1
add x3, x3, #1 ; move one digit up
ldp x8, x9, [x2] ; load 2 words from pSrc2
mul x12, x6, x8 ; Bits <63:0> of pSrc1[0]*pSrc2[j]
adcs x12, x12, x15 ; Adding the previous word (if there was a carry from the last addition it is added)
umulh x15, x6, x8 ; Bits <127:64> of pSrc1[0]*pSrc2[j]
str x12, [x4] ; Store to destination
mul x12, x6, x9 ; Bits <63:0> of pSrc1[0]*pSrc2[j+1]
adcs x12, x12, x15 ; Adding the previous word (if there was a carry from the last addition it is added)
umulh x15, x6, x9 ; Bits <127:64> of pSrc1[0]*pSrc2[j+1]
str x12, [x4, #8] ; Store to destination
ldp x8, x9, [x2, #16] ; load 2 words from pSrc2
mul x12, x6, x8 ; Bits <63:0> of pSrc1[0]*pSrc2[j+2]
adcs x12, x12, x15 ; Adding the previous word (if there was a carry from the last addition it is added)
umulh x15, x6, x8 ; Bits <127:64> of pSrc1[0]*pSrc2[j+2]
str x12, [x4, #16] ; Store to destination
mul x12, x6, x9 ; Bits <63:0> of pSrc1[0]*pSrc2[j+3]
adcs x12, x12, x15 ; Adding the previous word (if there was a carry from the last addition it is added)
umulh x15, x6, x9 ; Bits <127:64> of pSrc1[0]*pSrc2[j+3]
str x12, [x4, #24] ; Store to destination
add x2, x2, #32 ; advance pSrc2 one digit (add does not touch the flags)
add x4, x4, #32 ; advance pDst one digit
cbnz x3, SymCryptFdefRawMulAsmLoopInner1
adc x15, x15, XZR ; Fold the last carry into the final high word
str x15, [x4] ; Store it as the next word of the destination
add x1, x1, #1 ; move one word up
add x0, x0, #8 ; move start of pSrc1 one word up
add x5, x5, #8 ; move start of pDst one word up
;
; MAIN LOOP
;
SymCryptFdefRawMulAsmLoopOuter
mov x3, x17 ; set -nDigits2
mov x2, x16 ; set pSrc2
mov x4, x5 ; set pDst
ands x15, x15, XZR ; Clearing the carry flag and setting x15 = 0
ldr x6, [x0] ; load the next word from pSrc1
SymCryptFdefRawMulAsmLoopInner
add x3, x3, #1 ; move one digit up
ldp x8, x9, [x2] ; load 2 words from pSrc2
ldp x10, x11, [x4] ; load 2 words from pDst
mul x12, x6, x8 ; Bits <63:0> of pSrc1[i]*pSrc2[j]
adcs x12, x12, x15 ; Adding the previous word (if there was a carry from the last addition it is added)
umulh x15, x6, x8 ; Bits <127:64> of pSrc1[i]*pSrc2[j]
adc x15, x15, XZR ; Add the carry if any and don't update the flags
; Note: this cannot overflow as the maximum for <x15:x12> is (2^64-1)(2^64-1)+(2^64-1)+1 = 2^128 - 2^64 + 1
adds x12, x12, x10 ; add the word from the destination and update the flags (this can overflow)
str x12, [x4] ; Store to destination
mul x12, x6, x9 ; Bits <63:0> of pSrc1[i]*pSrc2[j+1]
adcs x12, x12, x15 ; Adding the previous word (if there was a carry from the last addition it is added)
umulh x15, x6, x9 ; Bits <127:64> of pSrc1[i]*pSrc2[j+1]
adc x15, x15, XZR ; Add the carry if any and don't update the flags
adds x12, x12, x11 ; add the word from the destination and update the flags (this can overflow)
str x12, [x4, #8] ; Store to destination
ldp x8, x9, [x2, #16] ; load 2 words from pSrc2
ldp x10, x11, [x4, #16] ; load 2 words from pDst
mul x12, x6, x8 ; Bits <63:0> of pSrc1[i]*pSrc2[j+2]
adcs x12, x12, x15 ; Adding the previous word (if there was a carry from the last addition it is added)
umulh x15, x6, x8 ; Bits <127:64> of pSrc1[i]*pSrc2[j+2]
adc x15, x15, XZR ; Add the carry if any and don't update the flags
adds x12, x12, x10 ; add the word from the destination and update the flags (this can overflow)
str x12, [x4, #16] ; Store to destination
mul x12, x6, x9 ; Bits <63:0> of pSrc1[i]*pSrc2[j+3]
adcs x12, x12, x15 ; Adding the previous word (if there was a carry from the last addition it is added)
umulh x15, x6, x9 ; Bits <127:64> of pSrc1[i]*pSrc2[j+3]
adc x15, x15, XZR ; Add the carry if any and don't update the flags
adds x12, x12, x11 ; add the word from the destination and update the flags (this can overflow)
str x12, [x4, #24] ; Store to destination
add x2, x2, #32 ; advance pSrc2 one digit (add does not touch the flags)
add x4, x4, #32 ; advance pDst one digit
cbnz x3, SymCryptFdefRawMulAsmLoopInner
adc x15, x15, XZR ; Fold the last carry into the final high word
str x15, [x4] ; Store it as the next word of the destination
adds x1, x1, #1 ; move one word up (sets Z for the bne below)
add x0, x0, #8 ; move start of pSrc1 one word up
add x5, x5, #8 ; move start of pDst one word up
bne SymCryptFdefRawMulAsmLoopOuter
; Done, no return value
ret
LEAF_END ARM64EC_NAME_MANGLE(SymCryptFdefRawMulAsm)
; Macro for the first loop of the first pass of RawSquareAsm.
; It takes one word from the source, multiplies it with the mulword,
; adds the high level word of the previous macro call, and stores it into
; the destination.
;
; No carry flag is propagated from the previous macro call as the maximum is
; (2^64-1)^2 + 2^64-1 = 2^128 - 2^64
;
; Parameter:
;   $index = word offset (scaled by 8 bytes) applied to both the source and destination pointers
; Inputs:   x2 = source pointer, x4 = destination pointer, x6 = mulword,
;           x15 = high word left by the previous call
; Outputs:  x15 = high word for the next call; [x4 + 8*$index] is written
; Clobbers: x8, x12 and the condition flags
MACRO
SQR_SINGLEADD_64 $index
ldr x8, [x2, #8*$index] ; pSrc[i+j]
mul x12, x6, x8 ; Bits <63:0> of pSrc[i]*pSrc[i+j]
adds x12, x12, x15 ; Adding the previous word
umulh x15, x6, x8 ; Bits <127:64> of pSrc[i]*pSrc[i+j]
adc x15, x15, XZR ; Add the intermediate carry and don't update the flags
str x12, [x4, #8*$index] ; Store to destination
MEND
; Macro for the remaining loops of the first pass of RawSquareAsm.
; The only difference to SQR_SINGLEADD_64 is that it also adds the word loaded
; from the destination buffer.
;
; No carry flag is propagated from the previous macro call as the maximum is
; (2^64-1)^2 + 2(2^64-1) = 2^128 - 1
;
; Parameter:
;   $index = word offset (scaled by 8 bytes) applied to both the source and destination pointers
; Inputs:   x2 = source pointer, x4 = destination pointer, x6 = mulword,
;           x15 = high word left by the previous call
; Outputs:  x15 = high word for the next call; [x4 + 8*$index] is updated in place
; Clobbers: x8, x10, x12 and the condition flags
MACRO
SQR_DOUBLEADD_64 $index
ldr x8, [x2, #8*$index] ; pSrc[i+j]
ldr x10, [x4, #8*$index] ; current destination word at the same offset
mul x12, x6, x8 ; Bits <63:0> of pSrc[i]*pSrc[i+j]
adds x12, x12, x15 ; Adding the previous word
umulh x15, x6, x8 ; Bits <127:64> of pSrc[i]*pSrc[i+j]
adc x15, x15, XZR ; Add the intermediate carry and don't update the flags
adds x12, x12, x10 ; Add the word from the destination
adc x15, x15, XZR ; Add the intermediate carry and don't update the flags
str x12, [x4, #8*$index] ; Store to destination
MEND
; Macro for the third pass loop of RawSquareAsm.
; It takes one mulword from the source, squares it, and
; adds it to the even columns of the destination. The carries are propagated
; to the odd columns.
;
; Here we can have a (1-bit) carry to the next call because the maximum value for
; a pair of columns is (2^64-1)^2+(2^128-1)+1 = 2^129 - 2^65 + 1 < 2^129 - 1
;
; Parameter:
;   $index = index of the source word within the current digit; the source is read at
;            8*$index bytes and the destination pair at 16*$index bytes
; Inputs:   x0 = source pointer, x4 = destination pointer, carry flag from the previous call
; Outputs:  the destination pair at [x4 + 16*$index] is updated in place; the carry flag is
;           left set/clear for the next call
; Clobbers: x6, x8, x9, x12, x15
MACRO
SQR_DIAGONAL_PROP $index
ldr x6, [x0, #8*$index] ; mulword
mul x12, x6, x6 ; Bits <63:0> of m^2
umulh x15, x6, x6 ; Bits <127:64> of m^2
ldp x8, x9, [x4, #16*$index] ; Load
; Adding the square to the even column
adcs x12, x12, x8 ; carry from previous and update the flags
; Propagating the sum to the next column
adcs x15, x15, x9 ; This can generate a carry
stp x12, x15, [x4, #16*$index] ; Store
MEND
; VOID
; SYMCRYPT_CALL
; SymCryptFdefRawSquareAsm(
; _In_reads_(nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc,
; UINT32 nDigits,
; _Out_writes_(2*nWords) PUINT32 pDst )
;
; Squares pSrc into pDst in three passes:
;   First pass  - accumulates the products pSrc[i]*pSrc[j] with j > i into pDst
;                 (each mulword starts its inner loop one word past itself)
;   Second pass - shifts the whole of pDst left by one bit (i.e. doubles it)
;   Third pass  - adds the squares pSrc[i]^2 onto the even columns and propagates the carries
; A digit is 4 words of 64 bits (32 bytes).
; NOTE(review): nDigits is a UINT32 but the full 64-bit x1 is used;
; this presumably relies on the caller zero-extending it - confirm.
;
; Initial inputs to registers:
; pSrc -> x0
; nDigits -> x1
; pDst -> x2
;
; Register assignments
; x0 = pSrc
; x1 = negated word count of pSrc
; x2 = pSrc (moving forward one digit / 4 words every inner loop)
; x3 = negated digit count of pSrc
; x4 = pDst (moving forward one digit every inner loop)
; x5 = pDst (moving forward one word every outer loop)
; x6 = Current word loaded from pSrc
; x8, x9 = Current words loaded in pairs from pSrc2
; (in the first pass x9 is instead the cyclic counter 0..3: the index of the
; current mulword within its digit, used to pick the inner loop entry point)
; x10, x11 = Current words loaded in pairs from pDst
; x12, x15 = "128-bit" sliding register to hold the result of multiplies
; x16 = Stored pSrc
; x17 = Negated digit count of pSrc
; x19 = Stored negated digit count of pSrc
; x20 = Stored pDst
; Note x13, x14 are reserved in ARM64EC and thus are not used
NESTED_ENTRY ARM64EC_NAME_MANGLE(SymCryptFdefRawSquareAsm)
PROLOG_SAVE_REG_PAIR fp, lr, #-32! ; allocate 32 bytes of stack; store FP/LR
PROLOG_SAVE_REG_PAIR x19, x20, #16 ; free up x19/x20
mov x3, x1 ; digit count into x3
lsl x1, x1, #2 ; Calculate word count
neg x1, x1 ; negate nWords
neg x3, x3 ; negate nDigits
mov x4, x2 ; pDst
mov x5, x2 ; store pDst
mov x20, x2 ; store pDst
mov x16, x0 ; store pSrc
mov x2, x0 ; inner loop pSrc
mov x17, x3 ; store -nDigits for later
mov x19, x3 ; store -nDigits for later
;
; First iteration of main loop (no adding of previous values from pDst)
;
ands x15, x15, XZR ; Clearing the carry flag and setting x15 = 0
ldr x6, [x0] ; load the first word from pSrc1
str x15, [x4] ; store 0 for the first word
b SymCryptFdefRawSquareAsmInnerLoopInit_Word1 ; skip word 0: the mulword is not multiplied with itself in this pass
SymCryptFdefRawSquareAsmInnerLoopInit_Word0
SQR_SINGLEADD_64 0
SymCryptFdefRawSquareAsmInnerLoopInit_Word1
SQR_SINGLEADD_64 1
SQR_SINGLEADD_64 2
SQR_SINGLEADD_64 3
add x3, x3, #1 ; move one digit up
add x2, x2, #32
add x4, x4, #32
cbnz x3, SymCryptFdefRawSquareAsmInnerLoopInit_Word0
str x15, [x4] ; Store the next word into the destination
add x1, x1, #1 ; move one word up
mov x9, #1 ; Cyclic counter
;
; MAIN LOOP
;
SymCryptFdefRawSquareAsmOuterLoop
add x5, x5, #8 ; move start of pDst one word up
mov x3, x17 ; set -nDigits
mov x2, x0 ; set pSrc
mov x4, x5 ; set pDst
ands x15, x15, XZR ; Clearing the carry flag and setting x15 = 0
ldr x6, [x0, x9, LSL #3] ; load the next word from pSrc
; Cyclic counter and jump logic
; After the increment x9 is the index of the first word to multiply with,
; i.e. the word just after the mulword within the current digit
add x9, x9, #1
cmp x9, #1
beq SymCryptFdefRawSquareAsmInnerLoop_Word1
cmp x9, #2
beq SymCryptFdefRawSquareAsmInnerLoop_Word2
cmp x9, #3
beq SymCryptFdefRawSquareAsmInnerLoop_Word3
; The following instructions are only executed when x9 == 4
mov x9, XZR ; Set it to 0
add x0, x0, #32 ; move start of pSrc 4 words up
add x5, x5, #32 ; move pDst 4 words up
mov x2, x0 ; set pSrc
mov x4, x5 ; set pDst
adds x17, x17, #1 ; add 1 digit (x17 is negated, so one digit fewer remains)
mov x3, x17 ; set the new digit counter
SymCryptFdefRawSquareAsmInnerLoop_Word0
SQR_DOUBLEADD_64 0
SymCryptFdefRawSquareAsmInnerLoop_Word1
SQR_DOUBLEADD_64 1
SymCryptFdefRawSquareAsmInnerLoop_Word2
SQR_DOUBLEADD_64 2
SymCryptFdefRawSquareAsmInnerLoop_Word3
SQR_DOUBLEADD_64 3
add x3, x3, #1 ; move one digit up
add x2, x2, #32
add x4, x4, #32
cbnz x3, SymCryptFdefRawSquareAsmInnerLoop_Word0
str x15, [x4] ; Store the next word into the destination
adds x1, x1, #1 ; move one word up
cmn x1, #1 ; Compare with -1
bne SymCryptFdefRawSquareAsmOuterLoop
ands x15, x15, XZR ; Setting x15 = 0
str x15, [x5, #40] ; Store 0 to destination for the top word
; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Second Pass - Shifting all results 1 bit left
; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
mov x3, x19 ; -nDigits
lsl x3, x3, #1 ; Double digits
mov x4, x20 ; pDst pointer
ands x8, x8, XZR ; Clear the flags
SymCryptFdefRawSquareAsmSecondPass
add x3, x3, #1 ; move one digit up
ldp x8, x9, [x4]
adcs x8, x8, x8 ; Shift left and add the carry
adcs x9, x9, x9
stp x8, x9, [x4], #16
ldp x10, x11, [x4]
adcs x10, x10, x10 ; Shift left and add the carry
adcs x11, x11, x11
stp x10, x11, [x4], #16
cbnz x3, SymCryptFdefRawSquareAsmSecondPass
; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Third Pass - Adding the squares on the even columns and propagating the sum
; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
ands x8, x8, XZR ; Clear the flags
mov x0, x16 ; src pointer
mov x4, x20 ; pDst pointer
mov x3, x19 ; -nDigits
SymCryptFdefRawSquareAsmThirdPass
SQR_DIAGONAL_PROP 0
SQR_DIAGONAL_PROP 1
SQR_DIAGONAL_PROP 2
SQR_DIAGONAL_PROP 3
add x3, x3, #1 ; move one digit up
add x0, x0, #32 ; One digit up (not updated in SQR_DIAGONAL_PROP)
add x4, x4, #64 ; Two digits up (not updated in SQR_DIAGONAL_PROP)
cbnz x3, SymCryptFdefRawSquareAsmThirdPass
; Done, no return value
EPILOG_RESTORE_REG_PAIR x19, x20, #16
EPILOG_RESTORE_REG_PAIR fp, lr, #32!
EPILOG_RETURN
NESTED_END ARM64EC_NAME_MANGLE(SymCryptFdefRawSquareAsm)
;VOID
;SymCryptFdefMontgomeryReduceAsm(
; _In_ PCSYMCRYPT_MODULUS pmMod,
; _In_ PUINT32 pSrc,
; _Out_ PUINT32 pDst )
;
; Word-by-word Montgomery reduction of pSrc modulo the value stored in pmMod.
; A digit is 4 words of 64 bits (32 bytes). Three phases:
;   1. Main loop: for each of the nWords low words, m = pSrc[i]*Inv64 and pSrc[i..] += m * pMod,
;      with the overflow of each pass accumulated into the next word and into hc.
;      pSrc is modified in place.
;   2. Subtraction: pDst = (upper nWords words of pSrc) - pMod, d = borrow.
;   3. Masked copy: if the subtraction borrowed and hc == 0, the unsubtracted upper words of pSrc
;      are copied over pDst; the choice is made with csel, not with a branch.
;
; Initial inputs to registers:
; pmMod -> x0
; pSrc -> x1
; pDst -> x2
;
; Register assignments
; x0 = pMod (moving forward one *digit* every inner loop)
; x1 = pSrc (moving forward one *digit* every inner loop)
; x2 = pDst (used only in the end for subtract / result)
; x3 = negated digit count of pSrc and pMod
; x4 = negated word count of pSrc
; x5 = Inv64 of the modulus
; x6 = m = pSrc[i]*Inv64
; x7 = hc = high carry variable
; x8, x9 = Current words loaded in pairs from pSrc
; x10, x11 = Current words loaded in pairs from pMod
; x12, x15 = c variable = "128-bit" register to hold the result of multiplies
; It is flipped between [x12:x15] and [x15:x12] instead of doing c>>=64
; x16 = Temporary intermediate result
; x17 = Stored negated digit count of pSrc
; x19 = Stored pMod pointer
; x20 = Stored pSrc pointer (moving forward one word every outer loop)
; Note x13, x14 are reserved in ARM64EC and thus are not used
NESTED_ENTRY ARM64EC_NAME_MANGLE(SymCryptFdefMontgomeryReduceAsm)
PROLOG_SAVE_REG_PAIR fp, lr, #-32! ; allocate 32 bytes of stack; store FP/LR
PROLOG_SAVE_REG_PAIR x19, x20, #16 ; free up x19/x20
ldr w3, [x0, #SymCryptModulusNdigitsOffsetArm64] ; # of Digits
ldr x5, [x0, #SymCryptModulusMontgomeryInv64OffsetArm64] ; Inv64 of modulus
add x0, x0, #SymCryptModulusValueOffsetArm64 ; pMod
lsl x4, x3, #2 ; Multiply by 4 to get the number of words
neg x3, x3 ; Negate the digit count
neg x4, x4 ; Negate the word count
mov x17, x3 ; Store the digit count for later
mov x19, x0 ; Store the pMod pointer
mov x20, x1 ; Store the pSrc pointer
ands x7, x7, XZR ; Set hc to 0
;
; Main loop
;
SymCryptFdefMontgomeryReduceAsmOuter
ldr x8, [x1] ; Load 1 word from pSrc
mul x6, x8, x5 ; <63:0> bits of pSrc[i]*Inv64 = m
ands x12, x12, XZR ; Set c to 0 (low half)
ands x15, x15, XZR ; Set c to 0 (high half)
SymCryptFdefMontgomeryReduceAsmInner
ldp x10, x11, [x0] ; pMod[j]
ldp x8, x9, [x1] ; pSrc[j]
mul x16, x6, x10 ; <63:0> of pMod[j]*m
adds x16, x16, x8 ; Adding pSrc[j]
umulh x15, x6, x10 ; <127:64> of pMod[j]*m
adc x15, x15, XZR ; Add the carry if any (***)
adds x12, x12, x16 ; Add the lower bits of c
adc x15, x15, XZR ; Add the carry if any (***)
; ***: These cannot produce extra carry as the maximum is
; (2^64 - 1)*(2^64-1) + 2^64-1 + 2^64-1 = 2^128 - 1
str x12, [x1] ; pSrc[j] = (UINT64) c
; c is now [x12:x15]: x15 holds the low word, x12 receives the new high word
mul x16, x6, x11 ; <63:0> of pMod[j+1]*m
adds x16, x16, x9 ; Adding pSrc[j+1]
umulh x12, x6, x11 ; <127:64> of pMod[j+1]*m
adc x12, x12, XZR ; Add the carry if any (***)
adds x15, x15, x16 ; Add the lower bits of c
adc x12, x12, XZR ; Add the carry if any (***)
str x15, [x1, #8] ; pSrc[j+1] = (UINT64) c
ldp x10, x11, [x0, #16] ; pMod[j+2], pMod[j+3]
ldp x8, x9, [x1, #16] ; pSrc[j+2], pSrc[j+3]
mul x16, x6, x10 ; <63:0> of pMod[j+2]*m
adds x16, x16, x8 ; Adding pSrc[j+2]
umulh x15, x6, x10 ; <127:64> of pMod[j+2]*m
adc x15, x15, XZR ; Add the carry if any (***)
adds x12, x12, x16 ; Add the lower bits of c
adc x15, x15, XZR ; Add the carry if any (***)
str x12, [x1, #16] ; pSrc[j+2] = (UINT64) c
mul x16, x6, x11 ; <63:0> of pMod[j+3]*m
adds x16, x16, x9 ; Adding pSrc[j+3]
umulh x12, x6, x11 ; <127:64> of pMod[j+3]*m
adc x12, x12, XZR ; Add the carry if any (***)
adds x15, x15, x16 ; Add the lower bits of c
adc x12, x12, XZR ; Add the carry if any (***)
str x15, [x1, #24] ; pSrc[j+3] = (UINT64) c
add x0, x0, #32
add x1, x1, #32
adds x3, x3, #1 ; Move one digit up
bne SymCryptFdefMontgomeryReduceAsmInner
ldr x8, [x1] ; pSrc[nWords]
adds x12, x12, x8 ; c + pSrc[nWords]
adc x15, XZR, XZR ; Capture the carry of that addition (0 or 1)
adds x12, x12, x7 ; c + pSrc[nWords] + hc
adc x7, x15, XZR ; Add the carry if any and store into hc
str x12, [x1] ; pSrc[nWords] = c
adds x4, x4, #1 ; Move one word up (the mov/add below leave the flags intact for the bne)
add x20, x20, #8 ; Move stored pSrc pointer one word up
mov x0, x19 ; Restore pMod pointer
mov x1, x20 ; Restore pSrc pointer
mov x3, x17 ; Restore the digit counter
bne SymCryptFdefMontgomeryReduceAsmOuter
;
; Subtraction
;
; At this point x20 has advanced nWords words, so it addresses the upper half of pSrc.
mov x16, x2 ; Store pDst pointer
; Prepare the pointers for subtract
mov x0, x20 ; pSrc
mov x1, x19 ; pMod
mov x10, x7 ; x10 = hc
mov x3, x17 ; Restore the digit counter
subs x4, x4, x4 ; Set the carry flag (i.e. no borrow)
SymCryptFdefMontgomeryReduceRawSubAsmLoop
add x3, x3, #1 ; Increment the digit count by one
; borrow is in the carry flag (flipped)
ldp x4, x6, [x0], #16 ; Load two words of pSrc1
ldp x5, x7, [x1], #16 ; Load two words of pSrc2
sbcs x4, x4, x5
sbcs x6, x6, x7
stp x4, x6, [x2], #16 ; Store the result in the destination
ldp x4, x6, [x0], #16 ; Load two words of pSrc1
ldp x5, x7, [x1], #16 ; Load two words of pSrc2
sbcs x4, x4, x5
sbcs x6, x6, x7
stp x4, x6, [x2], #16 ; Store the result in the destination
cbnz x3, SymCryptFdefMontgomeryReduceRawSubAsmLoop
csetcc x0 ; d = 1 if the carry is clear (i.e. the subtraction borrowed), else 0
orr x11, x10, x0 ; x11 = hc|d
; Prepare the pointers for masked copy
mov x0, x20 ; pSrc
mov x1, x16 ; pDst
mov x2, x17 ; Restore the digit counter
subs x4, x10, x11 ; If (x11 > x10) clear the carry flag (i.e. borrow); happens only when hc == 0 and d == 1
SymCryptFdefMontgomeryReduceMaskedCopyAsmLoop
add x2, x2, #1 ; Increment the digit count by one
ldp x4, x6, [x0], #16 ; Load two words of the source
ldp x5, x7, [x1] ; Load two words of the destination
cselcc x4, x4, x5 ; If the carry is clear, select the source operands
cselcc x6, x6, x7
stp x4, x6, [x1], #16 ; Store the two words in the destination
ldp x4, x6, [x0], #16 ; Load the next two words of the source
ldp x5, x7, [x1] ; Load the next two words of the destination
cselcc x4, x4, x5 ; If the carry is clear, select the source operands
cselcc x6, x6, x7
stp x4, x6, [x1], #16 ; Store the two words in the destination
cbnz x2, SymCryptFdefMontgomeryReduceMaskedCopyAsmLoop
; Done, no return value
EPILOG_RESTORE_REG_PAIR x19, x20, #16
EPILOG_RESTORE_REG_PAIR fp, lr, #32!
EPILOG_RETURN
NESTED_END ARM64EC_NAME_MANGLE(SymCryptFdefMontgomeryReduceAsm)
END

Просмотреть файл

@ -0,0 +1,705 @@
//
// fdef_asm.symcryptasm Assembler code for large integer arithmetic in the default data format for the arm64 architecture
// Expresses asm in a generic enough way to enable generation of MASM and GAS using the
// symcryptasm_processor.py script and C preprocessor
//
// Copyright (c) Microsoft Corporation. Licensed under the MIT license.
//
#include "symcryptasm_shared.cppasm"
// A digit consists of 4 words of 64 bits each
// SymCryptFdefRawAddAsm: multi-word addition with carry propagation.
// Adds the nDigits-digit values at Src1 and Src2, writes the sum to Dst, and returns the
// final carry (0 or 1, produced by the closing cset) in X_0.
// Each digit is handled as two ldp/adcs/adcs/stp groups of 16 bytes (32 bytes per digit).
// The first digit is peeled out of the loop: it starts the carry chain with ADDS (no carry-in)
// and lets the loop advance all three pointers with a single pre-indexed access each.
// NOTE(review): the peeled first digit is processed unconditionally, so this assumes nDigits >= 1 - confirm with callers.
//UINT32
//SYMCRYPT_CALL
//SymCryptFdefRawAddAsm(
// _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 Src1,
// _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 Src2,
// _Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 Dst,
// UINT32 nDigits )
FUNCTION_START(ARM64EC_NAME_MANGLE(SymCryptFdefRawAddAsm), 4, 8)
ldp X_4, X_6, [X_0] // Load two words of pSrc1
ldp X_5, X_7, [X_1] // Load two words of pSrc2
adds X_4, X_4, X_5 // Start of the carry chain: plain add, no carry-in
adcs X_6, X_6, X_7
stp X_4, X_6, [X_2] // Store the result in the destination
ldp X_4, X_6, [X_0, #16] // Load two words of pSrc1
sub X_3, X_3, #1 // Decrement the digit count by one (SUB rather than SUBS, so the carry flag is left untouched)
ldp X_5, X_7, [X_1, #16] // Load two words of pSrc2
adcs X_4, X_4, X_5
adcs X_6, X_6, X_7
stp X_4, X_6, [X_2, #16] // Store the result in the destination
cbz X_3, SymCryptFdefRawAddAsmEnd // Single-digit input: skip the loop (cbz does not modify the flags)
LABEL(SymCryptFdefRawAddAsmLoop)
// carry is in the carry flag
// only update pointers to srcs and destination once per loop to reduce uops and dependencies
ldp X_4, X_6, [X_0, #32]! // Load two words of pSrc1
ldp X_5, X_7, [X_1, #32]! // Load two words of pSrc2
adcs X_4, X_4, X_5
adcs X_6, X_6, X_7
stp X_4, X_6, [X_2, #32]! // Store the result in the destination
ldp X_4, X_6, [X_0, #16] // Load two words of pSrc1
sub X_3, X_3, #1 // Decrement the digit count by one (flags untouched, carry chain continues)
ldp X_5, X_7, [X_1, #16] // Load two words of pSrc2
adcs X_4, X_4, X_5
adcs X_6, X_6, X_7
stp X_4, X_6, [X_2, #16] // Store the result in the destination
cbnz X_3, SymCryptFdefRawAddAsmLoop
ALIGN(4)
LABEL(SymCryptFdefRawAddAsmEnd)
cset X_0, cs // Set the return value equal to the carry
FUNCTION_END(ARM64EC_NAME_MANGLE(SymCryptFdefRawAddAsm))
// SymCryptFdefRawSubAsm: multi-word subtraction with borrow propagation.
// Computes pDst = pSrc1 - pSrc2 over nDigits digits and returns 1 in X_0 if the final
// subtraction borrowed (carry flag clear), 0 otherwise.
// Each digit is handled as two ldp/sbcs/sbcs/stp groups of 16 bytes (32 bytes per digit).
// The first digit is peeled out of the loop: it starts the chain with SUBS (no borrow-in)
// and lets the loop advance all three pointers with a single pre-indexed access each.
// NOTE(review): the peeled first digit is processed unconditionally, so this assumes nDigits >= 1 - confirm with callers.
//UINT32
//SYMCRYPT_CALL
//SymCryptFdefRawSubAsm(
// _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc1,
// _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc2,
// _Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 pDst,
// UINT32 nDigits )
FUNCTION_START(ARM64EC_NAME_MANGLE(SymCryptFdefRawSubAsm), 4, 8)
ldp X_4, X_6, [X_0] // Load two words of pSrc1
ldp X_5, X_7, [X_1] // Load two words of pSrc2
subs X_4, X_4, X_5 // Start of the borrow chain: plain subtract, no borrow-in
sbcs X_6, X_6, X_7
stp X_4, X_6, [X_2] // Store the result in the destination
ldp X_4, X_6, [X_0, #16] // Load two words of pSrc1
sub X_3, X_3, #1 // Decrement the digit count by one (SUB rather than SUBS, so the carry flag is left untouched)
ldp X_5, X_7, [X_1, #16] // Load two words of pSrc2
sbcs X_4, X_4, X_5
sbcs X_6, X_6, X_7
stp X_4, X_6, [X_2, #16] // Store the result in the destination
cbz X_3, SymCryptFdefRawSubAsmEnd // Single-digit input: skip the loop (cbz does not modify the flags)
LABEL(SymCryptFdefRawSubAsmLoop)
// borrow is in the carry flag (flipped)
// only update pointers to srcs and destination once per loop to reduce uops and dependencies
ldp X_4, X_6, [X_0, #32]! // Load two words of pSrc1
ldp X_5, X_7, [X_1, #32]! // Load two words of pSrc2
sbcs X_4, X_4, X_5
sbcs X_6, X_6, X_7
stp X_4, X_6, [X_2, #32]! // Store the result in the destination
ldp X_4, X_6, [X_0, #16] // Load two words of pSrc1
sub X_3, X_3, #1 // Decrement the digit count by one (flags untouched, borrow chain continues)
ldp X_5, X_7, [X_1, #16] // Load two words of pSrc2
sbcs X_4, X_4, X_5
sbcs X_6, X_6, X_7
stp X_4, X_6, [X_2, #16] // Store the result in the destination
cbnz X_3, SymCryptFdefRawSubAsmLoop
ALIGN(4)
LABEL(SymCryptFdefRawSubAsmEnd)
cset X_0, cc // If the carry is clear (borrow), set the return value to 1
FUNCTION_END(ARM64EC_NAME_MANGLE(SymCryptFdefRawSubAsm))
// SymCryptFdefMaskedCopyAsm: mask-controlled copy of nDigits digits from pbSrc to pbDst.
// The 32-bit mask is broadcast across v0; BIT then replaces destination bits with source bits
// wherever the corresponding mask bit is 1, and leaves them unchanged where it is 0.
// Source and destination are always loaded and the destination is always stored; the only
// branch is the loop counter test, so the instruction stream does not depend on the mask.
// NOTE(review): callers presumably pass mask as either 0 or 0xFFFFFFFF - confirm.
// NOTE(review): the counter is only tested after an iteration, so this assumes nDigits >= 1 - confirm.
//VOID
//SYMCRYPT_CALL
//SymCryptFdefMaskedCopyAsm(
// _In_reads_bytes_( nDigits*SYMCRYPT_FDEF_DIGIT_SIZE ) PCBYTE pbSrc,
// _Inout_updates_bytes_( nDigits*SYMCRYPT_FDEF_DIGIT_SIZE ) PBYTE pbDst,
// UINT32 nDigits,
// UINT32 mask )
FUNCTION_START(ARM64EC_NAME_MANGLE(SymCryptFdefMaskedCopyAsm), 4, 4)
dup v0.4s, W_3 // broadcast the mask to v0
LABEL(SymCryptFdefMaskedCopyAsmLoop)
ldp q1, q3, [X_0], #32 // Load 4 words of the source (post-increment by one digit)
ldp q2, q4, [X_1] // Load 4 words of the destination
bit v2.16b, v1.16b, v0.16b // if the mask is 1s, overwrite the destination with source
bit v4.16b, v3.16b, v0.16b // if the mask is 1s, overwrite the destination with source
stp q2, q4, [X_1], #32 // Store the 4 words in the destination (post-increment by one digit)
sub X_2, X_2, #1 // Decrement the digit count by one
cbnz X_2, SymCryptFdefMaskedCopyAsmLoop
// Done, no return value
FUNCTION_END(ARM64EC_NAME_MANGLE(SymCryptFdefMaskedCopyAsm))
// SymCryptFdefRawMulAsm: schoolbook multiplication pDst = pSrc1 * pSrc2.
// The first word of pSrc1 is handled by a dedicated inner loop that only writes pDst;
// every following word runs the main inner loop, which reads pDst back and accumulates into it.
// pSrc2 and pDst are biased by -32 bytes on entry so the inner loops can use pre-indexed loads/stores.
//VOID
//SYMCRYPT_CALL
//SymCryptFdefRawMulAsm(
// _In_reads_(nDigits1*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc1,
// UINT32 nDigits1,
// _In_reads_(nDigits2*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc2,
// UINT32 nDigits2,
// _Out_writes_((nDigits1+nDigits2)*SYMCRYPT_FDEF_DIGIT_NUINT32) PUINT32 pDst )
//
// Basic structure:
// for each word in Src1:
// Dst += Src2 * word
//
// Register assignments
// X_0 = pSrc1 (moving forward one word every outer loop)
// X_1 = word count of pSrc1
// X_2 = pSrc2 (moving forward one *digit* every inner loop)
// X_3 = digit count of pSrc2 and pDst
// X_4 = pDst (moving forward one *digit* every inner loop)
// X_5 = Stored pDst (moving forward one word every outer loop)
// X_6 = Current word loaded from pSrc1
// X_7, X_8 = Current words loaded in pairs from pSrc2
// X_9, X_10 = Current words loaded in pairs from pDst
// X_11, X_12 = Scratch registers for holding the results of multiplies
// X_13 = Stored pSrc2
// X_14 = Stored digit count of pSrc2
// X_15 = Scratch register for holding the results of multiplies
FUNCTION_START(ARM64EC_NAME_MANGLE(SymCryptFdefRawMulAsm), 5, 16)
lsl X_1, X_1, #2 // Calculate word count
sub X_2, X_2, #32 // offset pSrc2 so we can use pre-increment form of loads
sub X_4, X_4, #32 // offset pDst so we can use pre-increment form of loads
mov X_5, X_4 // store pDst
mov X_13, X_2 // store pSrc2
mov X_14, X_3 // store nDigits2 for later
//
// First iteration of main loop (no adding of previous values from pDst)
//
ands X_12, X_12, xzr // Clearing the carry flag and setting X_12 = 0
ldr X_6, [X_0] // load the first word from pSrc1
LABEL(SymCryptFdefRawMulAsmLoopInner1)
sub X_3, X_3, #1 // move one digit up (SUB leaves the carry flag untouched)
ldp X_7, X_8, [X_2, #32]! // load 2 words from pSrc2
mul X_11, X_6, X_7 // Bits <63:0> of pSrc1[0]*pSrc2[j]
adcs X_11, X_11, X_12 // Adding the previous word (if there was a carry from the last addition it is added)
umulh X_12, X_6, X_7 // Bits <127:64> of pSrc1[0]*pSrc2[j]
mul X_15, X_6, X_8 // Bits <63:0> of pSrc1[0]*pSrc2[j+1]
adcs X_15, X_15, X_12 // Adding the previous word (if there was a carry from the last addition it is added)
umulh X_12, X_6, X_8 // Bits <127:64> of pSrc1[0]*pSrc2[j+1]
stp X_11, X_15, [X_4, #32]! // Store to destination
ldp X_7, X_8, [X_2, #16] // load 2 words from pSrc2
mul X_11, X_6, X_7 // Bits <63:0> of pSrc1[0]*pSrc2[j+2]
adcs X_11, X_11, X_12 // Adding the previous word (if there was a carry from the last addition it is added)
umulh X_12, X_6, X_7 // Bits <127:64> of pSrc1[0]*pSrc2[j+2]
mul X_15, X_6, X_8 // Bits <63:0> of pSrc1[0]*pSrc2[j+3]
adcs X_15, X_15, X_12 // Adding the previous word (if there was a carry from the last addition it is added)
umulh X_12, X_6, X_8 // Bits <127:64> of pSrc1[0]*pSrc2[j+3]
stp X_11, X_15, [X_4, #16] // Store to destination
cbnz X_3, SymCryptFdefRawMulAsmLoopInner1
adc X_12, X_12, xzr // Store the next word into the destination (with the carry if any)
str X_12, [X_4, #32]
sub X_1, X_1, #1 // move one word up (no zero test here; NOTE(review): presumably relies on nDigits1 >= 1, i.e. at least 4 words - confirm)
add X_0, X_0, #8 // move start of pSrc1 one word up
add X_5, X_5, #8 // move start of pDst one word up
//
// MAIN LOOP
//
LABEL(SymCryptFdefRawMulAsmLoopOuter)
mov X_3, X_14 // set nDigits2
mov X_2, X_13 // set pSrc2
mov X_4, X_5 // set pDst
ands X_12, X_12, xzr // Clearing the carry flag and setting X_12 = 0
ldr X_6, [X_0] // load the next word from pSrc1
LABEL(SymCryptFdefRawMulAsmLoopInner)
sub X_3, X_3, #1 // move one digit up (SUB leaves the carry flag untouched)
ldp X_7, X_8, [X_2, #32]! // load 2 words from pSrc2
ldp X_9, X_10, [X_4, #32]! // load 2 words from pDst
adcs X_9, X_9, X_12 // Adding the previous word (if there was a carry from the last addition it is added)
umulh X_11, X_6, X_7 // Bits <127:64> of pSrc1[i]*pSrc2[j]
adcs X_10, X_11, X_10 // Adding the previous word (if there was a carry from the last addition it is added)
umulh X_12, X_6, X_8 // Bits <127:64> of pSrc1[i]*pSrc2[j+1]
adc X_12, X_12, xzr // Add the carry if any and don't update the flags
mul X_11, X_6, X_7 // Bits <63:0> of pSrc1[i]*pSrc2[j]
adds X_9, X_9, X_11 // add the low product bits to the accumulated destination word and update the flags (this can overflow)
mul X_11, X_6, X_8 // Bits <63:0> of pSrc1[i]*pSrc2[j+1]
adcs X_10, X_10, X_11 // add the low product bits to the accumulated destination word and update the flags (this can overflow)
stp X_9, X_10, [X_4] // Store to destination
ldp X_7, X_8, [X_2, #16] // load 2 words from pSrc2
ldp X_9, X_10, [X_4, #16] // load 2 words from pDst
adcs X_9, X_9, X_12 // Adding the previous word (if there was a carry from the last addition it is added)
umulh X_11, X_6, X_7 // Bits <127:64> of pSrc1[i]*pSrc2[j+2]
adcs X_10, X_11, X_10 // Adding the previous word (if there was a carry from the last addition it is added)
umulh X_12, X_6, X_8 // Bits <127:64> of pSrc1[i]*pSrc2[j+3]
adc X_12, X_12, xzr // Add the carry if any and don't update the flags
mul X_11, X_6, X_7 // Bits <63:0> of pSrc1[i]*pSrc2[j+2]
adds X_9, X_9, X_11 // add the low product bits to the accumulated destination word and update the flags (this can overflow)
mul X_11, X_6, X_8 // Bits <63:0> of pSrc1[i]*pSrc2[j+3]
adcs X_10, X_10, X_11 // add the low product bits to the accumulated destination word and update the flags (this can overflow)
stp X_9, X_10, [X_4, #16] // Store to destination
cbnz X_3, SymCryptFdefRawMulAsmLoopInner
adc X_12, X_12, xzr // Store the next word into the destination (with the carry if any)
str X_12, [X_4, #32]
subs X_1, X_1, #1 // move one word up (SUBS: sets the Z flag consumed by the bne below)
add X_0, X_0, #8 // move start of pSrc1 one word up
add X_5, X_5, #8 // move start of pDst one word up
bne SymCryptFdefRawMulAsmLoopOuter
// Done, no return value
FUNCTION_END(ARM64EC_NAME_MANGLE(SymCryptFdefRawMulAsm))
// Macro for the first loop of the first pass of RawSquareAsm.
// It takes one word from the source, multiplies it with the mulword,
// adds the high level word of the previous macro call, and stores it into
// the destination.
//
// No carry flag is propagated from the previous macro call as the maximum is
// (2^64-1)^2 + 2^64-1 = 2^128 - 2^64
//
// Parameters:
//   index              word offset (scaled by 8 bytes) applied to both src_reg and dst_reg
//   src_reg, dst_reg   base pointers for the load and the store
//   mul_word           the multiplier word
//   src_carry          incoming high word from the previous invocation
//   dst_carry          outgoing high word for the next invocation
//   scratch0, scratch1 temporaries (loaded source word / low product word)
// src_carry and dst_carry may be the same register (the invocations in this file do that):
// src_carry is consumed by the adds before dst_carry is written by the umulh.
MACRO_START(SQR_SINGLEADD_64, index, src_reg, dst_reg, mul_word, src_carry, dst_carry, scratch0, scratch1)
ldr scratch0, [src_reg, #8*index] // pSrc[i+j]
mul scratch1, mul_word, scratch0 // Bits <63:0> of pSrc[i]*pSrc[i+j]
adds scratch1, scratch1, src_carry // Adding the previous word
umulh dst_carry, mul_word, scratch0 // Bits <127:64> of pSrc[i]*pSrc[i+j]
adc dst_carry, dst_carry, xzr // Add the intermediate carry and don't update the flags
str scratch1, [dst_reg, #8*index] // Store to destination
MACRO_END()
// Macro for the remaining loops of the first pass of RawSquareAsm.
// The only difference to the above is that it also adds the word loaded
// from the destination buffer.
//
// No carry flag is propagated from the previous macro call as the maximum is
// (2^64-1)^2 + 2(2^64-1) = 2^128 - 1
//
// Parameters:
//   index              word offset (scaled by 8 bytes) applied to both src_reg and dst_reg
//   src_reg, dst_reg   base pointers; dst_reg is both read and written at the same offset
//   mul_word           the multiplier word
//   src_carry          incoming high word from the previous invocation
//   dst_carry          outgoing high word for the next invocation
//   scratch0..scratch2 temporaries (source word / low product word / destination word)
// src_carry and dst_carry may be the same register (the invocations in this file do that):
// src_carry is consumed by the first adds before dst_carry is written by the umulh.
MACRO_START(SQR_DOUBLEADD_64, index, src_reg, dst_reg, mul_word, src_carry, dst_carry, scratch0, scratch1, scratch2)
ldr scratch0, [src_reg, #8*index] // pSrc[i+j]
ldr scratch2, [dst_reg, #8*index] // pDst[2*(i+j)]
mul scratch1, mul_word, scratch0 // Bits <63:0> of pSrc[i]*pSrc[i+j]
adds scratch1, scratch1, src_carry // Adding the previous word
umulh dst_carry, mul_word, scratch0 // Bits <127:64> of pSrc[i]*pSrc[i+j]
adc dst_carry, dst_carry, xzr // Add the intermediate carry and don't update the flags
adds scratch1, scratch1, scratch2 // Add the word from the destination
adc dst_carry, dst_carry, xzr // Add the intermediate carry and don't update the flags
str scratch1, [dst_reg, #8*index] // Store to destination
MACRO_END()
// Macro for the third pass loop of RawSquareAsm.
// It takes one mulword from the source, squares it, and
// adds it to the even columns of the destination. The carries are propagated
// to the odd columns.
//
// Here we can have a (1-bit) carry to the next call because the maximum value for
// a pair of columns is (2^64-1)^2+(2^128-1)+1 = 2^129 - 2^65 + 1 < 2^129 - 1
//
// Parameters:
//   index              word offset into src_reg (scaled by 8 bytes) and word-pair offset into dst_reg (scaled by 16 bytes)
//   src_reg, dst_reg   base pointers (neither is advanced by the macro)
//   squarelo, squarehi receive the low / high halves of the square; squarehi first holds the loaded word
//   scratch0, scratch1 the pair of destination words being updated
// Carry contract: the carry flag is both an input and an output (both additions are adcs),
// so the caller must clear it before the first invocation and must not disturb it between invocations.
MACRO_START(SQR_DIAGONAL_PROP, index, src_reg, dst_reg, squarelo, squarehi, scratch0, scratch1)
ldr squarehi, [src_reg, #8*index] // mulword
mul squarelo, squarehi, squarehi // Bits <63:0> of m^2
umulh squarehi, squarehi, squarehi // Bits <127:64> of m^2
ldp scratch0, scratch1, [dst_reg, #16*index] // Load
// Adding the square to the even column
adcs squarelo, squarelo, scratch0 // carry from previous and update the flags
// Propagating the sum to the next column
adcs squarehi, squarehi, scratch1 // This can generate a carry
stp squarelo, squarehi, [dst_reg, #16*index] // Store
MACRO_END()
// SymCryptFdefRawSquareAsm: squares the nDigits-digit value at pSrc into the 2*nDigits-digit buffer at pDst.
// Three passes:
//   1. accumulate the off-diagonal products pSrc[i]*pSrc[j] (j > i) into pDst,
//   2. shift the whole of pDst left by one bit (doubling those products),
//   3. add the squares pSrc[i]^2 onto the even columns, propagating carries.
//VOID
//SYMCRYPT_CALL
//SymCryptFdefRawSquareAsm(
// _In_reads_(nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc,
// UINT32 nDigits,
// _Out_writes_(2*nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PUINT32 pDst )
//
// Register assignments (first pass; passes 2 and 3 reuse X_6..X_10 as temporaries)
// X_0 = pSrc (moved forward one digit each time the cyclic counter wraps)
// X_1 = word count of pSrc (outer loop counter)
// X_2 = pSrc (moving forward one digit / 4 words every inner loop)
// X_3 = digit count of pSrc (inner loop counter)
// X_4 = pDst (moving forward one digit every inner loop)
// X_5 = pDst (moving forward one word every outer loop)
// X_6 = Current word loaded from pSrc
// X_7 = Scratch (source word inside the SQR_* macros)
// X_8 = Scratch in the first iteration, then the cyclic word-within-digit counter of the main loop
// X_9, X_10 = Scratch (low product word / word loaded from pDst inside SQR_DOUBLEADD_64)
// X_12 = High word carried from one macro invocation to the next (X_11 is not referenced by this routine)
// X_13 = Stored pSrc
// X_14 = Digit count of pSrc (reduced by one each time the cyclic counter wraps)
// X_15 = Stored digit count of pSrc
// X_16 = Stored pDst
FUNCTION_START(ARM64EC_NAME_MANGLE(SymCryptFdefRawSquareAsm), 3, 17)
mov X_3, X_1 // digit count into X_3
lsl X_1, X_1, #2 // Calculate word count
mov X_4, X_2 // pDst
mov X_5, X_2 // store pDst
mov X_16, X_2 // store pDst
mov X_13, X_0 // store pSrc
mov X_2, X_0 // inner loop pSrc
mov X_14, X_3 // store nDigits for later
mov X_15, X_3 // store nDigits for later
//
// First iteration of main loop (no adding of previous values from pDst)
//
ands X_12, X_12, xzr // Clearing the carry flag and setting X_12 = 0
ldr X_6, [X_0] // load the first word from pSrc
str X_12, [X_4] // store 0 for the first word
b SymCryptFdefRawSquareAsmInnerLoopInit_Word1 // word 0 times itself is a diagonal term (third pass), so start at word 1
LABEL(SymCryptFdefRawSquareAsmInnerLoopInit_Word0)
SQR_SINGLEADD_64 0, X_2, X_4, X_6, X_12, X_12, X_7, X_8
LABEL(SymCryptFdefRawSquareAsmInnerLoopInit_Word1)
SQR_SINGLEADD_64 1, X_2, X_4, X_6, X_12, X_12, X_7, X_8
SQR_SINGLEADD_64 2, X_2, X_4, X_6, X_12, X_12, X_7, X_8
SQR_SINGLEADD_64 3, X_2, X_4, X_6, X_12, X_12, X_7, X_8
sub X_3, X_3, #1 // move one digit up
add X_2, X_2, #32
add X_4, X_4, #32
cbnz X_3, SymCryptFdefRawSquareAsmInnerLoopInit_Word0
str X_12, [X_4] // Store the next word into the destination
sub X_1, X_1, #2 // move two words up (we started at the word 1)
mov X_8, #1 // Cyclic counter
//
// MAIN LOOP
//
LABEL(SymCryptFdefRawSquareAsmOuterLoop)
add X_5, X_5, #8 // move start of pDst one word up
mov X_3, X_14 // set nDigits
mov X_2, X_0 // set pSrc
mov X_4, X_5 // set pDst
ands X_12, X_12, xzr // Clearing the carry flag and setting X_12 = 0
ldr X_6, [X_0, X_8, LSL #3] // load the next word from pSrc
// Cyclic counter and jump logic
// (the counter selects which word of the first digit the inner loop is entered at)
add X_8, X_8, #1
cmp X_8, #1
beq SymCryptFdefRawSquareAsmInnerLoop_Word1
cmp X_8, #2
beq SymCryptFdefRawSquareAsmInnerLoop_Word2
cmp X_8, #3
beq SymCryptFdefRawSquareAsmInnerLoop_Word3
// The following instructions are only executed when X_8 == 4
mov X_8, xzr // Set it to 0
add X_0, X_0, #32 // move start of pSrc 4 words up
add X_5, X_5, #32 // move pDst 4 words up
mov X_2, X_0 // set pSrc
mov X_4, X_5 // set pDst
sub X_14, X_14, #1 // remove 1 digit
mov X_3, X_14 // set the new digit counter
LABEL(SymCryptFdefRawSquareAsmInnerLoop_Word0)
SQR_DOUBLEADD_64 0, X_2, X_4, X_6, X_12, X_12, X_7, X_9, X_10
LABEL(SymCryptFdefRawSquareAsmInnerLoop_Word1)
SQR_DOUBLEADD_64 1, X_2, X_4, X_6, X_12, X_12, X_7, X_9, X_10
LABEL(SymCryptFdefRawSquareAsmInnerLoop_Word2)
SQR_DOUBLEADD_64 2, X_2, X_4, X_6, X_12, X_12, X_7, X_9, X_10
LABEL(SymCryptFdefRawSquareAsmInnerLoop_Word3)
SQR_DOUBLEADD_64 3, X_2, X_4, X_6, X_12, X_12, X_7, X_9, X_10
sub X_3, X_3, #1 // move one digit up
add X_2, X_2, #32
add X_4, X_4, #32
cbnz X_3, SymCryptFdefRawSquareAsmInnerLoop_Word0
str X_12, [X_4] // Store the next word into the destination
sub X_1, X_1, #1 // move one word up
cbnz X_1, SymCryptFdefRawSquareAsmOuterLoop
ands X_12, X_12, xzr // Setting X_12 = 0
str X_12, [X_5, #40] // Store 0 to destination for the top word
////////////////////////////////////////////////////////////////
// Second Pass - Shifting all results 1 bit left
////////////////////////////////////////////////////////////////
mov X_3, X_15 // nDigits
lsl X_3, X_3, #1 // Double digits
mov X_4, X_16 // pDst pointer
ands X_7, X_7, xzr // Clear the flags
LABEL(SymCryptFdefRawSquareAsmSecondPass)
sub X_3, X_3, #1 // move one digit up (SUB leaves the carry flag untouched)
ldp X_7, X_8, [X_4]
adcs X_7, X_7, X_7 // Shift left and add the carry
adcs X_8, X_8, X_8
stp X_7, X_8, [X_4], #16
ldp X_9, X_10, [X_4]
adcs X_9, X_9, X_9 // Shift left and add the carry
adcs X_10, X_10, X_10
stp X_9, X_10, [X_4], #16
cbnz X_3, SymCryptFdefRawSquareAsmSecondPass
//////////////////////////////////////////////////////////////////////////////
// Third Pass - Adding the squares on the even columns and propagating the sum
//////////////////////////////////////////////////////////////////////////////
ands X_7, X_7, xzr // Clear the flags (SQR_DIAGONAL_PROP consumes the carry flag)
mov X_0, X_13 // src pointer
mov X_4, X_16 // pDst pointer
mov X_3, X_15 // nDigits
LABEL(SymCryptFdefRawSquareAsmThirdPass)
SQR_DIAGONAL_PROP 0, X_0, X_4, X_6, X_7, X_8, X_9
SQR_DIAGONAL_PROP 1, X_0, X_4, X_6, X_7, X_8, X_9
SQR_DIAGONAL_PROP 2, X_0, X_4, X_6, X_7, X_8, X_9
SQR_DIAGONAL_PROP 3, X_0, X_4, X_6, X_7, X_8, X_9
sub X_3, X_3, #1 // move one digit up (flag-preserving, as are the two adds below)
add X_0, X_0, #32 // One digit up (not updated in SQR_DIAGONAL_PROP)
add X_4, X_4, #64 // Two digits up (not updated in SQR_DIAGONAL_PROP)
cbnz X_3, SymCryptFdefRawSquareAsmThirdPass
// Done, no return value
FUNCTION_END(ARM64EC_NAME_MANGLE(SymCryptFdefRawSquareAsm))
// SymCryptFdefMontgomeryReduceAsm: Montgomery reduction of pSrc modulo pmMod, result in pDst.
// Outline of what the code below does:
//   1. Outer loop, once per word i of the modulus: m = pSrc[i]*Inv64, then the inner loop adds
//      m*pMod into pSrc starting at word i and writes each low word back into pSrc.
//      The word above the modulus length absorbs the running value c and the high carry hc.
//   2. Subtraction: pDst = (upper half of pSrc) - pMod, with the final borrow captured as d.
//   3. Selection: pDst is overwritten with the unsubtracted upper half of pSrc when the
//      carry flag produced from hc and d is clear; otherwise the subtracted value is kept.
// pSrc is modified in place. All three pointers are biased by -32 bytes so that pre-indexed
// loads/stores can be used.
//VOID
//SYMCRYPT_CALL
//SymCryptFdefMontgomeryReduceAsm(
// _In_ PCSYMCRYPT_MODULUS pmMod,
// _Inout_ PUINT32 pSrc,
// _Out_ PUINT32 pDst )
//
// Register assignments
// X_0 = pMod (moving forward one *digit* every inner loop)
// X_1 = pSrc (moving forward one *digit* every inner loop)
// X_2 = pDst (used only in the end for subtract / result)
// X_3 = digit count of pSrc and pMod
// X_4 = word count of pSrc
// X_5 = Inv64 of the modulus
// X_6 = m = pSrc[i]*Inv64
// X_7 = hc = high carry variable
// X_8, X_9 = Current words loaded in pairs from pSrc
// X_10, X_11 = Current words loaded in pairs from pMod
// X_12, X_13 = c variable = "128-bit" register to hold the result of multiplies
// It is flipped between [X_12:X_13] and [X_13:X_12] instead of doing c>>=64
// X_14 = Temporary intermediate result
// X_15 = Stored digit count of pSrc
// X_16 = Stored pMod pointer
// X_17 = Stored pSrc pointer (moving forward one word every outer loop)
// (The subtraction and masked-copy tails reuse X_4..X_7, X_10, X_11 and X_14 for other purposes.)
FUNCTION_START(ARM64EC_NAME_MANGLE(SymCryptFdefMontgomeryReduceAsm), 3, 18)
ldr W_3, [X_0, #SymCryptModulusNdigitsOffsetArm64] // # of Digits
ldr X_5, [X_0, #SymCryptModulusMontgomeryInv64OffsetArm64] // Inv64 of modulus
add X_0, X_0, #SymCryptModulusValueOffsetArm64 // pMod
lsl X_4, X_3, #2 // Multiply by 4 to get the number of words
sub X_0, X_0, #32 // offset pMod so we can use pre-increment form of loads
sub X_1, X_1, #32 // offset pSrc so we can use pre-increment form of loads
sub X_2, X_2, #32 // offset pDst so we can use pre-increment form of loads
mov X_15, X_3 // Store the digit count for later
mov X_16, X_0 // Store the pMod pointer
mov X_17, X_1 // Store the pSrc pointer
and X_7, X_7, xzr // Set hc to 0
//
// Main loop
//
LABEL(SymCryptFdefMontgomeryReduceAsmOuter)
ldr X_8, [X_1, #32] // Load 1 word from pSrc (the #32 undoes the -32 bias on X_1)
mul X_6, X_8, X_5 // <63:0> bits of pSrc[i]*Inv64 = m
and X_12, X_12, xzr // Set c to 0
LABEL(SymCryptFdefMontgomeryReduceAsmInner)
ldp X_10, X_11, [X_0, #32]! // pMod[j]
ldp X_8, X_9, [X_1, #32]! // pSrc[j]
mul X_14, X_6, X_10 // <63:0> of pMod[j]*m
adds X_14, X_14, X_8 // Adding pSrc[j]
umulh X_13, X_6, X_10 // <127:64> of pMod[j]*m
adc X_13, X_13, xzr // Add the carry if any (***)
adds X_12, X_12, X_14 // Add the lower bits of c
adc X_13, X_13, xzr // Add the carry if any (***)
// ***: These cannot produce extra carry as the maximum is
// (2^64 - 1)*(2^64-1) + 2^64-1 + 2^64-1 = 2^128 - 1
str X_12, [X_1] // pSrc[j] = (UINT64) c
mul X_14, X_6, X_11 // <63:0> of pMod[j]*m
adds X_14, X_14, X_9 // Adding pSrc[j]
umulh X_12, X_6, X_11 // <127:64> of pMod[j]*m
adc X_12, X_12, xzr // Add the carry if any (***)
adds X_13, X_13, X_14 // Add the lower bits of c
adc X_12, X_12, xzr // Add the carry if any (***)
str X_13, [X_1, #8] // pSrc[j] = (UINT64) c
ldp X_10, X_11, [X_0, #16] // pMod[j]
ldp X_8, X_9, [X_1, #16] // pSrc[j]
mul X_14, X_6, X_10 // <63:0> of pMod[j]*m
adds X_14, X_14, X_8 // Adding pSrc[j]
umulh X_13, X_6, X_10 // <127:64> of pMod[j]*m
adc X_13, X_13, xzr // Add the carry if any (***)
adds X_12, X_12, X_14 // Add the lower bits of c
adc X_13, X_13, xzr // Add the carry if any (***)
str X_12, [X_1, #16] // pSrc[j] = (UINT64) c
mul X_14, X_6, X_11 // <63:0> of pMod[j]*m
adds X_14, X_14, X_9 // Adding pSrc[j]
umulh X_12, X_6, X_11 // <127:64> of pMod[j]*m
adc X_12, X_12, xzr // Add the carry if any (***)
adds X_13, X_13, X_14 // Add the lower bits of c
adc X_12, X_12, xzr // Add the carry if any (***)
str X_13, [X_1, #24] // pSrc[j] = (UINT64) c
subs X_3, X_3, #1 // Move one digit up
bne SymCryptFdefMontgomeryReduceAsmInner
ldr X_8, [X_1, #32] // pSrc[nWords]
adds X_12, X_12, X_8 // c + pSrc[nWords]
adc X_13, xzr, xzr // Add the carry if any
adds X_12, X_12, X_7 // c + pSrc[nWords] + hc
adc X_7, X_13, xzr // Add the carry if any and store into hc
str X_12, [X_1, #32] // pSrc[nWords] = c
subs X_4, X_4, #1 // Move one word up (sets the Z flag tested by the bne below; the add/mov in between leave flags alone)
add X_17, X_17, #8 // Move stored pSrc pointer one word up
mov X_0, X_16 // Restore pMod pointer
mov X_1, X_17 // Restore pSrc pointer
mov X_3, X_15 // Restore the digit counter
bne SymCryptFdefMontgomeryReduceAsmOuter
//
// Subtraction
//
mov X_14, X_2 // Store pDst pointer
// Prepare the pointers for subtract
mov X_0, X_17 // pSrc (the stored pointer has advanced one word per outer iteration, so this is the upper half)
mov X_1, X_16 // pMod
mov X_10, X_7 // X_10 = hc
mov X_3, X_15 // Restore the digit counter
subs X_4, X_4, X_4 // Set the carry flag (i.e. no borrow)
LABEL(SymCryptFdefMontgomeryReduceRawSubAsmLoop)
sub X_3, X_3, #1 // Decrement the digit count by one (SUB leaves the borrow chain's carry flag untouched)
// borrow is in the carry flag (flipped)
ldp X_4, X_6, [X_0, #32]! // Load two words of pSrc1
ldp X_5, X_7, [X_1, #32]! // Load two words of pSrc2
sbcs X_4, X_4, X_5
sbcs X_6, X_6, X_7
stp X_4, X_6, [X_2, #32]! // Store the result in the destination
ldp X_4, X_6, [X_0, #16] // Load two words of pSrc1
ldp X_5, X_7, [X_1, #16] // Load two words of pSrc2
sbcs X_4, X_4, X_5
sbcs X_6, X_6, X_7
stp X_4, X_6, [X_2, #16] // Store the result in the destination
cbnz X_3, SymCryptFdefMontgomeryReduceRawSubAsmLoop
cset X_0, cc // d = 1 if the carry is clear (the subtraction borrowed), else 0
orr X_11, X_10, X_0 // X_11 = hc|d
// Prepare the pointers for masked copy
mov X_0, X_17 // pSrc
mov X_1, X_14 // pDst
mov X_2, X_15 // Restore the digit counter
subs X_4, X_10, X_11 // If (X_11 > X_10) clear the carry flag (i.e. borrow); that happens only when d == 1 and bit 0 of hc is clear
LABEL(SymCryptFdefMontgomeryReduceMaskedCopyAsmLoop)
sub X_2, X_2, #1 // decrement the digit count by one (the carry flag set above stays valid for every csel)
ldp X_4, X_6, [X_0, #32]! // Load two words of the source
ldp X_5, X_7, [X_1, #32]! // Load two words of the destination
csel X_4, X_4, X_5, cc // If the carry is clear, select the source operands
csel X_6, X_6, X_7, cc
stp X_4, X_6, [X_1] // Store the two words in the destination
ldp X_4, X_6, [X_0, #16]
ldp X_5, X_7, [X_1, #16]
csel X_4, X_4, X_5, cc
csel X_6, X_6, X_7, cc
stp X_4, X_6, [X_1, #16]
cbnz X_2, SymCryptFdefMontgomeryReduceMaskedCopyAsmLoop
// Done, no return value
FUNCTION_END(ARM64EC_NAME_MANGLE(SymCryptFdefMontgomeryReduceAsm))
FILE_END()

Просмотреть файл

@ -1,28 +0,0 @@
;
; SymCrypt_magic.inc
; Copyright (c) Microsoft Corporation. Licensed under the MIT license.
;
; Include file to define the support macros for the Magic field
;
IMPORT ARM64EC_NAME_MANGLE(SymCryptFatal)
#define SYMCRYPT_CODE_VERSION (SYMCRYPT_CODE_VERSION_API * 65536 + SYMCRYPT_CODE_VERSION_MINOR)
#define SYMCRYPT_MAGIC_CONSTANT (0x53316d76 + SYMCRYPT_CODE_VERSION)
; SYMCRYPT_CHECK_MAGIC: debug-only integrity check of an object's magic field.
; Loads the value at [$ptr + $offset], subtracts $ptr from it, and compares the result with
; SYMCRYPT_MAGIC_CONSTANT; on mismatch it calls SymCryptFatal with the code 'magc'.
; Expands to nothing unless SYMCRYPT_DEBUG is set. $temp1 and $temp2 are clobbered.
; NOTE(review): the failure path names register r0, which is not an AArch64 register name
; (x0/w0 would be expected) - confirm this path assembles when SYMCRYPT_DEBUG is enabled.
MACRO
SYMCRYPT_CHECK_MAGIC $temp1, $temp2, $ptr, $offset
#if SYMCRYPT_DEBUG
ldr $temp1, [$ptr, #$offset] ; stored magic value
subs $temp1, $temp1, $ptr ; remove the object address from the stored value
mov32 $temp2, SYMCRYPT_MAGIC_CONSTANT
cmp $temp1, $temp2
beq %F1 ; match: skip the fatal call
mov32 r0, 0x6d616763 ; 'magc'
bl ARM64EC_NAME_MANGLE(SymCryptFatal)
1
#endif
MEND

Просмотреть файл

@ -1,37 +0,0 @@
TTL "SymCryptWipe"
;++
;
; Copyright (c) Microsoft Corporation. Licensed under the MIT license.
;
; Secure wipe
;
;--
#include "ksarm64.h"
#include "symcrypt_name_mangling.inc"
TEXTAREA
EXTERN ARM64EC_NAME_MANGLE(memset)
SUBT "SymCryptWipe"
; SymCryptWipeAsm: zero cbData bytes at pbData by tail-branching into memset.
; On entry x0 = pbData and x1 = cbData; memset is entered with x0 = pbData, x1 = 0, x2 = cbData.
;VOID
;SYMCRYPT_CALL
;SymCryptWipe( _Out_writes_bytes_( cbData ) PVOID pbData,
; SIZE_T cbData )
LEAF_ENTRY ARM64EC_NAME_MANGLE(SymCryptWipeAsm)
; we just jump to memset.
; this is enough to stop the compiler optimizing the memset away.
mov x2, x1 ; length argument; must be copied before x1 is overwritten below
mov x1, #0 ; fill value
b ARM64EC_NAME_MANGLE(memset) ; branch without link: memset returns directly to our caller
LEAF_END ARM64EC_NAME_MANGLE(SymCryptWipeAsm)
END

Просмотреть файл

@ -0,0 +1,31 @@
//
// wipe.symcryptasm Assembler code for wiping a buffer
// Expresses asm in a generic enough way to enable generation of MASM and GAS using the
// symcryptasm_processor.py script and C preprocessor
//
// Copyright (c) Microsoft Corporation. Licensed under the MIT license.
#include "symcryptasm_shared.cppasm"
TEXTAREA()
EXTERN(ARM64EC_NAME_MANGLE(memset))
// SymCryptWipeAsm: zero cbData bytes at pbData by branching into memset.
// The two incoming arguments are rearranged into memset's (destination, value, length) order.
// NOTE(review): X_0..X_2 are presumably mapped by symcryptasm_processor.py onto the first three
// argument registers of the selected calling convention - confirm against the script.
//VOID
//SYMCRYPT_CALL
//SymCryptWipe( _Out_writes_bytes_( cbData ) PVOID pbData,
// SIZE_T cbData )
FUNCTION_START(ARM64EC_NAME_MANGLE(SymCryptWipeAsm), 2, 3)
// we just jump to memset.
// this is enough to stop the compiler optimizing the memset away.
mov X_2, X_1 // length argument; must be copied before X_1 is overwritten below
mov X_1, #0 // fill value
b ARM64EC_NAME_MANGLE(memset) // branch without link: memset returns directly to our caller
FUNCTION_END(ARM64EC_NAME_MANGLE(SymCryptWipeAsm))
FILE_END()

Просмотреть файл

@ -1 +0,0 @@
#include "..\arm64\fdef369_asm.asm"

Просмотреть файл

@ -1 +0,0 @@
#include "..\arm64\fdef_asm.asm"

Просмотреть файл

@ -1 +0,0 @@
#include "..\arm64\symcrypt_magic.inc"

Просмотреть файл

@ -1 +0,0 @@
#include "..\arm64\symcrypt_name_mangling.inc"

Просмотреть файл

@ -1 +0,0 @@
#include "..\arm64\wipe.asm"

Просмотреть файл

@ -1,132 +0,0 @@
//
// asmstubs.c
// Temporary forwarders for ASM implementations which we don't yet support with GCC/LLVM on Arm64
//
// Copyright (c) Microsoft Corporation. Licensed under the MIT license.
//
#include "../precomp.h"
// Zeroes cbData bytes starting at pbData.
// Every store goes through a volatile-qualified pointer so the compiler cannot elide the wipe.
VOID
SYMCRYPT_CALL
SymCryptWipeAsm( _Out_writes_bytes_( cbData ) PVOID pbData, SIZE_T cbData )
{
    volatile BYTE * pbCur = (volatile BYTE *) pbData;
    SIZE_T cbLeft = cbData;

    // Walk the buffer front to back, one byte per volatile store.
    while( cbLeft != 0 )
    {
        *pbCur = 0;
        pbCur++;
        cbLeft--;
    }
}
// Portable C implementation that the stub below forwards to (defined elsewhere).
VOID SYMCRYPT_CALL
SymCryptFdefMaskedCopyC(
    _In_reads_bytes_( nDigits*SYMCRYPT_FDEF_DIGIT_SIZE ) PCBYTE pbSrc,
    _Inout_updates_bytes_( nDigits*SYMCRYPT_FDEF_DIGIT_SIZE ) PBYTE pbDst,
    UINT32 nDigits,
    UINT32 mask );

// Stand-in for the assembly routine of the same name: passes every argument,
// unchanged and in order, to the C implementation.
VOID SYMCRYPT_CALL
SymCryptFdefMaskedCopyAsm(
    _In_reads_bytes_( nDigits*SYMCRYPT_FDEF_DIGIT_SIZE ) PCBYTE pbSrc,
    _Inout_updates_bytes_( nDigits*SYMCRYPT_FDEF_DIGIT_SIZE ) PBYTE pbDst,
    UINT32 nDigits,
    UINT32 mask )
{
    SymCryptFdefMaskedCopyC(
        pbSrc,
        pbDst,
        nDigits,
        mask );
}
// Portable C implementation that the stub below forwards to (defined elsewhere).
UINT32 SYMCRYPT_CALL
SymCryptFdefRawAddC(
    _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc1,
    _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc2,
    _Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 pDst,
    UINT32 nDigits );

// Stand-in for the assembly routine of the same name: forwards to the C implementation
// and hands its return value back to the caller untouched.
UINT32 SYMCRYPT_CALL
SymCryptFdefRawAddAsm(
    _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc1,
    _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc2,
    _Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 pDst,
    UINT32 nDigits )
{
    UINT32 result;

    result = SymCryptFdefRawAddC( pSrc1, pSrc2, pDst, nDigits );
    return result;
}
// Portable C implementation that the stub below forwards to (defined elsewhere).
UINT32 SYMCRYPT_CALL
SymCryptFdefRawSubC(
    _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc1,
    _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc2,
    _Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 pDst,
    UINT32 nDigits );

// Stand-in for the assembly routine of the same name: forwards to the C implementation
// and hands its return value back to the caller untouched.
UINT32 SYMCRYPT_CALL
SymCryptFdefRawSubAsm(
    _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc1,
    _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc2,
    _Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 pDst,
    UINT32 nDigits )
{
    UINT32 result;

    result = SymCryptFdefRawSubC( pSrc1, pSrc2, pDst, nDigits );
    return result;
}
// Portable C implementation that the stub below forwards to (defined elsewhere).
// The buffer-size annotations are written in terms of the actual parameters (digit counts);
// the previous text referred to nWords1 / nWords2, which are not parameters of this function.
VOID
SYMCRYPT_CALL
SymCryptFdefRawMulC(
    _In_reads_(nDigits1*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc1,
    UINT32 nDigits1,
    _In_reads_(nDigits2*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc2,
    UINT32 nDigits2,
    _Out_writes_((nDigits1+nDigits2)*SYMCRYPT_FDEF_DIGIT_NUINT32) PUINT32 pDst );

// Stand-in for the assembly routine of the same name: passes every argument,
// unchanged and in order, to the C implementation.
VOID
SYMCRYPT_CALL
SymCryptFdefRawMulAsm(
    _In_reads_(nDigits1*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc1,
    UINT32 nDigits1,
    _In_reads_(nDigits2*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc2,
    UINT32 nDigits2,
    _Out_writes_((nDigits1+nDigits2)*SYMCRYPT_FDEF_DIGIT_NUINT32) PUINT32 pDst )
{
    SymCryptFdefRawMulC( pSrc1, nDigits1, pSrc2, nDigits2, pDst );
}
// Portable C implementation that the stub below forwards to (defined elsewhere).
// The buffer-size annotations are written in terms of the actual parameter nDigits;
// the previous text used the undeclared name nWords and the misspelling nDgigits.
VOID
SYMCRYPT_CALL
SymCryptFdefRawSquareC(
    _In_reads_(nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc,
    UINT32 nDigits,
    _Out_writes_(2*nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PUINT32 pDst );

// Stand-in for the assembly routine of the same name: passes every argument,
// unchanged and in order, to the C implementation.
VOID
SYMCRYPT_CALL
SymCryptFdefRawSquareAsm(
    _In_reads_(nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc,
    UINT32 nDigits,
    _Out_writes_(2*nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PUINT32 pDst )
{
    SymCryptFdefRawSquareC( pSrc, nDigits, pDst );
}
VOID
SymCryptFdefMontgomeryReduceC(
_In_ PCSYMCRYPT_MODULUS pmMod,
_In_ PUINT32 pSrc,
_Out_ PUINT32 pDst );
VOID
SYMCRYPT_CALL
SymCryptFdefMontgomeryReduceAsm(
_In_ PCSYMCRYPT_MODULUS pmMod,
_In_ PUINT32 pSrc,
_Out_ PUINT32 pDst )
{
SymCryptFdefMontgomeryReduceC( pmMod, pSrc, pDst );
}

Просмотреть файл

@ -4,9 +4,18 @@
# Preprocess amd64 .symcryptasm into masm
{amd64\}.symcryptasm{$(OBJ_PATH)\$(O)\..\amd64\}.asm:
..\scripts\symcryptasm_processor.py masm $< $(OBJ_PATH)\$(O)\$(<B).cppasm
..\scripts\symcryptasm_processor.py masm amd64 msft $< $(OBJ_PATH)\$(O)\$(<B).cppasm
$(CC) $(CFLAGS) /EP /P /I..\inc\ /I.\ /DSYMCRYPT_CPU_AMD64 /DSYMCRYPT_MASM /Fi$@ $(OBJ_PATH)\$(O)\$(<B).cppasm
# Preprocess x86 .cppasm into masm
# Runs the C compiler as a preprocessor only; /Fi names the preprocessed output after the
# rule target ($@), and the SYMCRYPT_CPU_X86 / SYMCRYPT_MASM defines are visible to the .cppasm source.
{i386\}.cppasm{$(OBJ_PATH)\$(O)\..\i386\}.asm:
$(CC) $(CFLAGS) /EP /P /I..\inc\ /I.\ /DSYMCRYPT_CPU_X86 /DSYMCRYPT_MASM /Fi$@ $<
# Translate arm64 .symcryptasm into armasm64-syntax .asm using the aapcs64 calling convention
# (symcryptasm_processor.py arguments: assembler, architecture, calling convention, input, output)
# Unlike the amd64 rule there is no explicit C preprocessor step here.
{arm64\}.symcryptasm{$(OBJ_PATH)\$(O)\..\arm64\}.asm:
..\scripts\symcryptasm_processor.py armasm64 arm64 aapcs64 $< $(OBJ_PATH)\$(O)\$(<B).asm
# Translate arm64 .symcryptasm into armasm64-syntax .asm using the arm64ec calling convention
# Same sources and script as the plain arm64 rule; only the calling-convention argument and the
# arm64ec output subdirectory differ.
{arm64\}.symcryptasm{$(OBJ_PATH)\$(O)\..\arm64\arm64ec\}.asm:
..\scripts\symcryptasm_processor.py armasm64 arm64 arm64ec $< $(OBJ_PATH)\$(O)\arm64ec\$(<B).asm

Просмотреть файл

@ -18,8 +18,13 @@ GUARD = 1 # enable CFG
ENABLE_ASM_RETPOLINE = 1
ENABLE_RETPOLINE_LINKER_WARNING = 1
# Enable /Gy for all assembler code
ASM_DEFINES=$(ASM_DEFINES) /Gy
# Enable /Gy for all assembler code, and some additional symcryptasm definitions for Arm64 assembler code
# SYMCRYPT_CPU_ARM64 is tested by symcryptasm_shared.cppasm to pull in the Arm64-specific includes.
# NOTE(review): SYMCRYPT_MASM presumably selects that header's MASM-style macro definitions - confirm.
# The value is chosen by the !IF block that follows the line continuation.
ASM_DEFINES=\
!IF "$(_BUILDARCH)" == "arm64"
$(ASM_DEFINES) /Gy /DSYMCRYPT_CPU_ARM64 /DSYMCRYPT_MASM
!ELSE
$(ASM_DEFINES) /Gy
!ENDIF
USE_MAKEFILE_INC = 1
@ -35,6 +40,15 @@ NTTARGETFILE0=\
!ELSEIF "$(_BUILDARCH)" == "x86"
$(OBJ_PATH)\$(O)\..\i386\aesasm.asm \
$(OBJ_PATH)\$(O)\..\i386\fdef_asm.asm \
!ELSEIF "$(_BUILDARCH)" == "arm64"
$(OBJ_PATH)\$(O)\..\arm64\fdef_asm.asm \
$(OBJ_PATH)\$(O)\..\arm64\fdef369_asm.asm \
$(OBJ_PATH)\$(O)\..\arm64\wipe.asm \
!IF "$(ARM64X_EC_ENABLED)" == "1"
$(OBJ_PATH)\$(O)\..\arm64\arm64ec\fdef_asm.asm \
$(OBJ_PATH)\$(O)\..\arm64\arm64ec\fdef369_asm.asm \
$(OBJ_PATH)\$(O)\..\arm64\arm64ec\wipe.asm \
!ENDIF
!ENDIF
INCLUDES= \

Просмотреть файл

@ -9,6 +9,10 @@
#if defined(SYMCRYPT_CPU_AMD64)
include ksamd64.inc
#elif defined(SYMCRYPT_CPU_ARM64)
#include "ksarm64.h"
#include "arm64/symcrypt_name_mangling.inc"
#include "symcrypt_version.inc"
#endif
#include "C_asm_shared.inc"
@ -17,17 +21,29 @@ include ksamd64.inc
#define ALIGN(__alignment) align __alignment
#define GET_SYMBOL_ADDRESS(__symbol) __symbol
#define HEX(__constant) __constant##h
#define TEXTAREA() TEXTAREA
#define EXTERN(__label) EXTERN __label
#define LABEL(__labelname) __labelname
#elif defined(SYMCRYPT_GAS)
#if defined(SYMCRYPT_CPU_AMD64)
.intel_syntax noprefix
#elif defined(SYMCRYPT_CPU_ARM64)
#include "arm64/symcrypt_name_mangling.inc"
#include "symcrypt_version.inc"
#endif
#include "C_asm_shared.inc"
#define FILE_END()
#define ALIGN(__alignment) .align __alignment
#define GET_SYMBOL_ADDRESS(__symbol) __symbol+rip
#define HEX(__constant) 0x##__constant
#define TEXTAREA()
#define EXTERN(__label)
#define LABEL(__labelname) __labelname:
#else

Просмотреть файл

@ -5,20 +5,26 @@ environments without requiring forking or duplication of source files - symcrypt
assembly in an assembler and environment agnostic way.
The current target assemblers are:
MASM and GAS
MASM, GAS, and armasm64 (Arm64 assembler which ships with MSVC)
The current target environments are:
amd64 Windows (using the Microsoft x64 calling convention), and
amd64 Linux (using the SystemV amd64 calling convention)
amd64 Windows (using the Microsoft x64 calling convention),
amd64 Linux (using the SystemV amd64 calling convention),
arm64 Windows (using the aapcs64 calling convention),
arm64 Windows (using the arm64ec calling convention), and
arm64 Linux (using the aapcs64 calling convention)
Currently we only support functions with up to 6 arguments, and only amd64, but the plan is to
rephrase all remaining .asm in SymCrypt as symcryptasm, extending support as appropriate to enable
this effort.
The processing of symcryptasm files takes place in 2 passes. The first pass is performed by this
symcryptasm_processor.py script, which does the more stateful processing, outputting a .cppasm file.
The plan is to rephrase all remaining .asm in SymCrypt as symcryptasm, extending support as
appropriate to enable this effort.
Normally the processing of symcryptasm files takes place in 2 passes. The first pass is performed by
this symcryptasm_processor.py script, which does the more stateful processing, outputting a .cppasm
file.
The .cppasm files are further processed by the C preprocessor to do more simple stateless text
substitutions, outputting a .asm file which can be assembled by the target assembler for the target
environment.
The exception is the armasm64 assembler, which already runs the C preprocessor on its inputs before
assembling them; so the output of this script is assembled directly by armasm64.
We have set up the intermediate generated files to be created in the output directories in both
razzle and CMake builds.
@ -42,6 +48,7 @@ FUNCTION_START macro which currently takes 3 arguments:
These arguments will be accessible in some contiguous region of the symcrypt registers at the
start of the function
On amd64 this contiguous region is R1..R<arg_count>
On arm64 this contiguous region is R0..R<arg_count-1>
Note: arg_count need not correspond to the exact number of arguments in the function declaration
if the assembly does not use some tail of the arguments
3) The number of registers (reg_count) that the function uses
@ -58,6 +65,7 @@ At the function end an epilogue is generated with restores the non-volatile regi
A nested function (a function which does call another function) is specified similarly, only using
NESTED_FUNCTION_START and NESTED_FUNCTION_END macros. A nested function currently updates and aligns
the stack pointer in the function prologue, and avoids use of the redzone in the SystemV ABI.
Nested functions are not currently supported for Arm64.
A macro begins with an invocation of the MACRO_START macro which takes the Macro name, and variable
@ -82,6 +90,15 @@ and QH. As rdx is used to pass arguments, its value is moved to another register
prologue. The MUL_FUNCTION_START and MUL_FUNCTION_END macros are used in this case.
We currently do not support nested mul functions, as we have none of them.
### arm64 ###
We allow up to 23 registers to be addressed, with the names:
X_0-X_22 (64-bit registers) and W_0-W_22 (32-bit registers)
v0-v7 ASIMD registers may be used directly in assembly too, as in both arm64 calling conventions we
currently support, these registers are volatile so do not need any special handling
X_0 is always the result register and the first argument passed to the function.
X_1-X_7 are the arguments 2-8 passed to the function
"""
import re
@ -91,37 +108,71 @@ import logging
class Register:
"""A class to represent registers"""
def __init__(self, name64, name32, name16, name8):
def __init__(self, name64, name32, name16=None, name8=None):
self.name64 = name64
self.name32 = name32
self.name16 = name16
self.name8 = name8
# amd64 registers
REG_RAX = Register("rax", "eax", "ax", "al")
REG_RBX = Register("rbx", "ebx", "bx", "bl")
REG_RCX = Register("rcx", "ecx", "cx", "cl")
REG_RDX = Register("rdx", "edx", "dx", "dl")
REG_RSI = Register("rsi", "esi", "si", "sil")
REG_RDI = Register("rdi", "edi", "di", "dil")
REG_RSP = Register("rsp", "esp", "sp", "spl")
REG_RBP = Register("rbp", "ebp", "bp", "bpl")
REG_R8 = Register( "r8", "r8d", "r8w", "r8b")
REG_R9 = Register( "r9", "r9d", "r9w", "r9b")
REG_R10 = Register("r10", "r10d", "r10w", "r10b")
REG_R11 = Register("r11", "r11d", "r11w", "r11b")
REG_R12 = Register("r12", "r12d", "r12w", "r12b")
REG_R13 = Register("r13", "r13d", "r13w", "r13b")
REG_R14 = Register("r14", "r14d", "r14w", "r14b")
REG_R15 = Register("r15", "r15d", "r15w", "r15b")
AMD64_RAX = Register("rax", "eax", "ax", "al")
AMD64_RBX = Register("rbx", "ebx", "bx", "bl")
AMD64_RCX = Register("rcx", "ecx", "cx", "cl")
AMD64_RDX = Register("rdx", "edx", "dx", "dl")
AMD64_RSI = Register("rsi", "esi", "si", "sil")
AMD64_RDI = Register("rdi", "edi", "di", "dil")
AMD64_RSP = Register("rsp", "esp", "sp", "spl")
AMD64_RBP = Register("rbp", "ebp", "bp", "bpl")
AMD64_R8 = Register( "r8", "r8d", "r8w", "r8b")
AMD64_R9 = Register( "r9", "r9d", "r9w", "r9b")
AMD64_R10 = Register("r10", "r10d", "r10w", "r10b")
AMD64_R11 = Register("r11", "r11d", "r11w", "r11b")
AMD64_R12 = Register("r12", "r12d", "r12w", "r12b")
AMD64_R13 = Register("r13", "r13d", "r13w", "r13b")
AMD64_R14 = Register("r14", "r14d", "r14w", "r14b")
AMD64_R15 = Register("r15", "r15d", "r15w", "r15b")
# arm64 registers
# Each Register is constructed with only its 64-bit (xN) and 32-bit (wN) names; the 16-bit and
# 8-bit names are left at the Register constructor defaults for arm64.
# All of x0-x30 are declared here; which of them are actually exposed to symcryptasm is decided
# by the per-calling-convention mapping tables, not by this list.
ARM64_R0 = Register( "x0", "w0")
ARM64_R1 = Register( "x1", "w1")
ARM64_R2 = Register( "x2", "w2")
ARM64_R3 = Register( "x3", "w3")
ARM64_R4 = Register( "x4", "w4")
ARM64_R5 = Register( "x5", "w5")
ARM64_R6 = Register( "x6", "w6")
ARM64_R7 = Register( "x7", "w7")
ARM64_R8 = Register( "x8", "w8")
ARM64_R9 = Register( "x9", "w9")
ARM64_R10 = Register("x10", "w10")
ARM64_R11 = Register("x11", "w11")
ARM64_R12 = Register("x12", "w12")
ARM64_R13 = Register("x13", "w13")
ARM64_R14 = Register("x14", "w14")
ARM64_R15 = Register("x15", "w15")
ARM64_R16 = Register("x16", "w16")
ARM64_R17 = Register("x17", "w17")
ARM64_R18 = Register("x18", "w18")
ARM64_R19 = Register("x19", "w19")
ARM64_R20 = Register("x20", "w20")
ARM64_R21 = Register("x21", "w21")
ARM64_R22 = Register("x22", "w22")
ARM64_R23 = Register("x23", "w23")
ARM64_R24 = Register("x24", "w24")
ARM64_R25 = Register("x25", "w25")
ARM64_R26 = Register("x26", "w26")
ARM64_R27 = Register("x27", "w27")
ARM64_R28 = Register("x28", "w28")
ARM64_R29 = Register("x29", "w29") # Frame Pointer
ARM64_R30 = Register("x30", "w30") # Link Register
class CallingConvention:
"""A class to represent calling conventions"""
def __init__(self, name, architecture, mapping, argument_registers, volatile_registers, gen_prologue_fn, gen_epilogue_fn, gen_get_memslot_offset_fn):
def __init__(self, name, architecture, mapping, max_arguments, argument_registers, volatile_registers, gen_prologue_fn, gen_epilogue_fn, gen_get_memslot_offset_fn):
self.name = name
self.architecture = architecture
self.mapping = mapping
self.max_arguments = max_arguments
self.argument_registers = argument_registers
self.volatile_registers = volatile_registers
self.gen_prologue_fn = types.MethodType(gen_prologue_fn, self)
@ -139,9 +190,9 @@ def get_mul_mapping_from_normal_mapping(mapping, argument_registers):
we refer to rdx using (Q|D|W|B)H.
"""
rdx_index = None
return_mapping = { 'H': REG_RDX }
return_mapping = { 'H': AMD64_RDX }
for (index, register) in mapping.items():
if register == REG_RDX:
if register == AMD64_RDX:
rdx_index = index
break
for (index, register) in mapping.items():
@ -156,28 +207,23 @@ def get_mul_mapping_from_normal_mapping(mapping, argument_registers):
return_mapping[index-1] = register
return return_mapping
# Calling convention constants
MAX_FUNCTION_ARGUMENT_COUNT = 6 # restrict to 6 arguments for now
MAX_FUNCTION_REGISTER_COUNT = 15
# Microsoft x64 calling convention
MAPPING_AMD64_MSFT = {
0: REG_RAX, # Result register
1: REG_RCX, # Argument 1 / volatile
2: REG_RDX, # Argument 2 / volatile
3: REG_R8, # Argument 3 / volatile
4: REG_R9, # Argument 4 / volatile
5: REG_R10, # volatile
6: REG_R11, # volatile
7: REG_RSI, # All registers from rsi are non-volatile and need to be saved/restored in epi/prologue
8: REG_RDI,
9: REG_RBP,
10:REG_RBX,
11:REG_R12,
12:REG_R13,
13:REG_R14,
14:REG_R15,
0: AMD64_RAX, # Result register / volatile
1: AMD64_RCX, # Argument 1 / volatile
2: AMD64_RDX, # Argument 2 / volatile
3: AMD64_R8, # Argument 3 / volatile
4: AMD64_R9, # Argument 4 / volatile
5: AMD64_R10, # volatile
6: AMD64_R11, # volatile
7: AMD64_RSI, # All registers from rsi are non-volatile and need to be saved/restored in epi/prologue
8: AMD64_RDI,
9: AMD64_RBP,
10:AMD64_RBX,
11:AMD64_R12,
12:AMD64_R13,
13:AMD64_R14,
14:AMD64_R15,
# currently not mapping rsp
}
@ -212,11 +258,11 @@ def gen_prologue_amd64_msft(self, arg_count, reg_count, mul_fixup="", nested=Fal
prologue += mul_fixup
# put additional arguments into Q5-Q6 (we do not support more than 6 (MAX_FUNCTION_ARGUMENT_COUNT) arguments for now)
# put additional arguments into Q5-Q6 (we do not support more than 6 arguments for now)
# stack_offset to get the 5th argument is:
# 32B of shadow space + 8B for return address + (8*#pushed registers in prologue) + shadow_space_allocation_size
stack_offset = 32 + 8 + (8*(reg_count-self.volatile_registers)) + shadow_space_allocation_size
for i in range(self.argument_registers+1, min(arg_count+1, MAX_FUNCTION_ARGUMENT_COUNT+1)):
for i in range(self.argument_registers+1, arg_count+1):
prologue += "mov Q%s, [rsp + %d]\n" % (i, stack_offset)
stack_offset += 8
return prologue
@ -247,7 +293,7 @@ def gen_epilogue_amd64_msft_nested(self, arg_count, reg_count):
def gen_get_memslot_offset_amd64_msft(self, slot, arg_count, reg_count, nested=False):
# only support 4 memory slots for now (in shadow space)
if(slot >= 4):
logging.error("Symcryptasm currently only support 4 memory slots! (requested slot%d)" % slot)
logging.error("symcryptasm currently only support 4 memory slots! (requested slot%d)" % slot)
exit(1)
# 8B for return address + (8*#pushed registers in prologue)
stack_offset = 8 + (8*(reg_count-self.volatile_registers))
@ -259,32 +305,32 @@ def gen_get_memslot_offset_amd64_msft_nested(self, slot, arg_count, reg_count):
return gen_get_memslot_offset_amd64_msft(self, slot, arg_count, reg_count, nested=True)
CALLING_CONVENTION_AMD64_MSFT = CallingConvention(
"msft_x64", "amd64", MAPPING_AMD64_MSFT, 4, 7,
"msft_x64", "amd64", MAPPING_AMD64_MSFT, 6, 4, 7,
gen_prologue_amd64_msft, gen_epilogue_amd64_msft, gen_get_memslot_offset_amd64_msft)
CALLING_CONVENTION_AMD64_MSFT_MUL = CallingConvention(
"msft_x64", "amd64", get_mul_mapping_from_normal_mapping(MAPPING_AMD64_MSFT, 4), 4, 6,
"msft_x64", "amd64", get_mul_mapping_from_normal_mapping(MAPPING_AMD64_MSFT, 4), 6, 4, 6,
gen_prologue_amd64_msft_mul, gen_epilogue_amd64_msft, gen_get_memslot_offset_amd64_msft)
CALLING_CONVENTION_AMD64_MSFT_NESTED = CallingConvention(
"msft_x64", "amd64", MAPPING_AMD64_MSFT, 4, 7,
"msft_x64", "amd64", MAPPING_AMD64_MSFT, 6, 4, 7,
gen_prologue_amd64_msft_nested, gen_epilogue_amd64_msft_nested, gen_get_memslot_offset_amd64_msft_nested)
# AMD64 System V calling convention
MAPPING_AMD64_SYSTEMV = {
0: REG_RAX, # Result register
1: REG_RDI, # Argument 1 / volatile
2: REG_RSI, # Argument 2 / volatile
3: REG_RDX, # Argument 3 / volatile
4: REG_RCX, # Argument 4 / volatile
5: REG_R8, # Argument 5 / volatile
6: REG_R9, # Argument 6 / volatile
7: REG_R10, # volatile
8: REG_R11, # volatile
9: REG_RBX, # All registers from rbx are non-volatile and need to be saved/restored in epi/prologue
10:REG_RBP,
11:REG_R12,
12:REG_R13,
13:REG_R14,
14:REG_R15
0: AMD64_RAX, # Result register / volatile
1: AMD64_RDI, # Argument 1 / volatile
2: AMD64_RSI, # Argument 2 / volatile
3: AMD64_RDX, # Argument 3 / volatile
4: AMD64_RCX, # Argument 4 / volatile
5: AMD64_R8, # Argument 5 / volatile
6: AMD64_R9, # Argument 6 / volatile
7: AMD64_R10, # volatile
8: AMD64_R11, # volatile
9: AMD64_RBX, # All registers from rbx are non-volatile and need to be saved/restored in epi/prologue
10:AMD64_RBP,
11:AMD64_R12,
12:AMD64_R13,
13:AMD64_R14,
14:AMD64_R15
# currently not mapping rsp
}
@ -305,7 +351,7 @@ def gen_prologue_amd64_systemv(self, arg_count, reg_count, mul_fixup="", nested=
prologue += mul_fixup
# do not support more than 6 (MAX_FUNCTION_ARGUMENT_COUNT) arguments for now
# do not support more than 6 arguments for now
# # put additional arguments into Q7-Qn
# # stack_offset to get the 7th argument is:
# # 8B for return address
@ -341,7 +387,7 @@ def gen_epilogue_amd64_systemv_nested(self, arg_count, reg_count):
def gen_get_memslot_offset_amd64_systemv(self, slot, arg_count, reg_count, nested=False):
# only support 4 memory slots for now
if(slot >= 4):
logging.error("Symcryptasm currently only support 4 memory slots! (requested slot%d)" % slot)
logging.error("symcryptasm currently only support 4 memory slots! (requested slot%d)" % slot)
exit(1)
# For leaf functions, use the top of the redzone below the stack pointer
offset = -8 * (slot+1)
@ -354,58 +400,230 @@ def gen_get_memslot_offset_amd64_systemv_nested(self, slot, arg_count, reg_count
return gen_get_memslot_offset_amd64_systemv(self, slot, arg_count, reg_count, nested=True)
CALLING_CONVENTION_AMD64_SYSTEMV = CallingConvention(
"amd64_systemv", "amd64", MAPPING_AMD64_SYSTEMV, 6, 9,
"amd64_systemv", "amd64", MAPPING_AMD64_SYSTEMV, 6, 6, 9,
gen_prologue_amd64_systemv, gen_epilogue_amd64_systemv, gen_get_memslot_offset_amd64_systemv)
CALLING_CONVENTION_AMD64_SYSTEMV_MUL = CallingConvention(
"amd64_systemv", "amd64", get_mul_mapping_from_normal_mapping(MAPPING_AMD64_SYSTEMV, 6), 6, 8,
"amd64_systemv", "amd64", get_mul_mapping_from_normal_mapping(MAPPING_AMD64_SYSTEMV, 6), 6, 6, 8,
gen_prologue_amd64_systemv_mul, gen_epilogue_amd64_systemv, gen_get_memslot_offset_amd64_systemv)
CALLING_CONVENTION_AMD64_SYSTEMV_NESTED = CallingConvention(
"amd64_systemv", "amd64", MAPPING_AMD64_SYSTEMV, 6, 9,
"amd64_systemv", "amd64", MAPPING_AMD64_SYSTEMV, 6, 6, 9,
gen_prologue_amd64_systemv_nested, gen_epilogue_amd64_systemv_nested, gen_get_memslot_offset_amd64_systemv_nested)
def gen_function_start_defines(mapping, arg_count, reg_count):
# ARM64 calling conventions

# AAPCS64 mapping: key N is the symcryptasm register index (written X_N / W_N in .symcryptasm
# source), value is the physical arm64 register it resolves to.
# Indices 0-17 are volatile and indices 18-22 are non-volatile, as annotated on each entry; the
# index and the physical register number diverge from index 18 onwards because x18 is skipped.
MAPPING_ARM64_AAPCS64 = {
0: ARM64_R0, # Argument 1 / Result register / volatile
1: ARM64_R1, # Argument 2 / volatile
2: ARM64_R2, # Argument 3 / volatile
3: ARM64_R3, # Argument 4 / volatile
4: ARM64_R4, # Argument 5 / volatile
5: ARM64_R5, # Argument 6 / volatile
6: ARM64_R6, # Argument 7 / volatile
7: ARM64_R7, # Argument 8 / volatile
8: ARM64_R8, # Indirect result location / volatile
9: ARM64_R9, # volatile
10:ARM64_R10, # volatile
11:ARM64_R11, # volatile
12:ARM64_R12, # volatile
13:ARM64_R13, # volatile
14:ARM64_R14, # volatile
15:ARM64_R15, # volatile
# R16 and R17 are intra-procedure-call temporary registers which may be used by the linker
# We cannot use these registers for local scratch if we call out to arbitrary procedures, but
# currently we only have leaf functions in Arm64 symcryptasm.
16:ARM64_R16, # IP0 / volatile
17:ARM64_R17, # IP1 / volatile
# R18 is a platform register which has a special meaning in kernel mode - we do not use it
18:ARM64_R19, # non-volatile
19:ARM64_R20, # non-volatile
20:ARM64_R21, # non-volatile
21:ARM64_R22, # non-volatile
22:ARM64_R23, # non-volatile
# We could map more registers (R24-R28) but we can only support 23 registers for ARM64EC, and we
# don't use this many registers in any symcryptasm yet
}
# ARM64EC mapping: key N is the symcryptasm register index (written X_N / W_N in .symcryptasm
# source), value is the physical arm64 register it resolves to.
# Registers noted below as reserved in ARM64EC are skipped, so indices 0-15 are volatile and
# indices 16-22 are non-volatile, as annotated on each entry.
MAPPING_ARM64_ARM64ECMSFT = {
0: ARM64_R0, # Argument 1 / Result register / volatile
1: ARM64_R1, # Argument 2 / volatile
2: ARM64_R2, # Argument 3 / volatile
3: ARM64_R3, # Argument 4 / volatile
4: ARM64_R4, # Argument 5 / volatile
5: ARM64_R5, # Argument 6 / volatile
6: ARM64_R6, # Argument 7 / volatile
7: ARM64_R7, # Argument 8 / volatile
8: ARM64_R8, # Indirect result location / volatile
9: ARM64_R9, # volatile
10:ARM64_R10, # volatile
11:ARM64_R11, # volatile
12:ARM64_R12, # volatile
# R13 and R14 are reserved in ARM64EC
13:ARM64_R15, # volatile
14:ARM64_R16, # volatile
15:ARM64_R17, # volatile
16:ARM64_R19, # non-volatile
17:ARM64_R20, # non-volatile
18:ARM64_R21, # non-volatile
19:ARM64_R22, # non-volatile
# R23 and R24 are reserved in ARM64EC
20:ARM64_R25, # non-volatile
21:ARM64_R26, # non-volatile
22:ARM64_R27, # non-volatile
# R28 is reserved in ARM64EC
}
def gen_prologue_aapcs64(self, arg_count, reg_count):
    """Return the prologue text for an aapcs64 leaf function.

    No prologue instructions are needed when the function only uses volatile registers, so the
    result is always the empty string. Requesting more registers than self.volatile_registers
    would require spilling, which is not supported: an error is logged and the script exits
    with status 1.

    arg_count is accepted for signature parity with the other prologue generators and is unused.
    """
    needs_spill = reg_count > self.volatile_registers
    if needs_spill:
        logging.error("symcryptasm currently does not support spilling registers in leaf functions in aapcs64")
        exit(1)
    return ""
def gen_epilogue_aapcs64(self, arg_count, reg_count):
    """Return the epilogue text for an aapcs64 leaf function.

    Only volatile registers may be used, so there is nothing to restore and the epilogue is a
    bare return instruction. Requesting more registers than self.volatile_registers would
    require spilling, which is not supported: an error is logged and the script exits with
    status 1.

    arg_count is accepted for signature parity with the other epilogue generators and is unused.
    """
    needs_spill = reg_count > self.volatile_registers
    if needs_spill:
        logging.error("symcryptasm currently does not support spilling registers in leaf functions in aapcs64")
        exit(1)
    return " ret\n"
def gen_prologue_arm64ec(self, arg_count, reg_count):
    """Return the prologue text for an arm64ec function.

    When the function uses no more than self.volatile_registers registers, nothing is saved and
    the result is the empty string. Otherwise a stack frame is allocated and the following are
    stored into it, in order of increasing offset:
      * fp/lr at offset 0 (the same macro invocation allocates the frame),
      * symcryptasm registers X_<volatile_registers> .. X_<reg_count-1>, in pairs,
      * a final single register when the number of registers to save is odd.

    arg_count is accepted for signature parity with the other prologue generators and is unused.
    """
    nonvolatile_used = reg_count - self.volatile_registers
    if nonvolatile_used <= 0:
        # Only volatile registers are in use, so no frame is required
        return ""

    # Allocating a frame means fp and lr are saved too, hence the extra 2 registers
    spill_count = nonvolatile_used + 2
    # Keep the stack pointer 16B aligned: one 16B slot per register pair, rounded up
    frame_size = 16 * ((spill_count + 1) // 2)

    emitted = [" PROLOG_SAVE_REG_PAIR fp, lr, #-%d! // allocate %d bytes of stack; store FP/LR\n" % (frame_size, frame_size)]

    # Slot 0 holds fp/lr, so the first saved register pair goes in slot 1 (offset 16)
    pair_starts = range(self.volatile_registers, reg_count-1, 2)
    for slot, first in enumerate(pair_starts, start=1):
        emitted.append(" PROLOG_SAVE_REG_PAIR X_%d, X_%d, #%d\n" % (first, first+1, 16 * slot))

    if spill_count % 2 == 1:
        # The last register has no partner; it takes the slot after the final pair
        emitted.append(" PROLOG_SAVE_REG X_%d, #%d\n" % (reg_count-1, 16 * (len(pair_starts) + 1)))

    return "".join(emitted)
def gen_epilogue_arm64ec(self, arg_count, reg_count):
    """Return the epilogue text for an arm64ec function.

    When the function uses no more than self.volatile_registers registers, the epilogue is a
    bare return instruction. Otherwise the saved registers are restored from the highest stack
    offset downwards (a trailing unpaired register first, then the register pairs in
    descending order), fp/lr are restored while deallocating the frame, and the return macro
    is emitted.

    arg_count is accepted for signature parity with the other epilogue generators and is unused.
    """
    nonvolatile_used = reg_count - self.volatile_registers
    if nonvolatile_used <= 0:
        # No frame was allocated, so a plain return suffices
        return " ret\n"

    # Allocating a frame means fp and lr are saved too, hence the extra 2 registers
    spill_count = nonvolatile_used + 2
    # Keep the stack pointer 16B aligned: one 16B slot per register pair, rounded up
    frame_size = 16 * ((spill_count + 1) // 2)

    emitted = []
    # Start at the topmost 16B slot of the frame and walk down towards fp/lr at offset 0
    offset = frame_size - 16

    if spill_count % 2 == 1:
        # An odd count means the last register was saved on its own in the top slot
        emitted.append(" EPILOG_RESTORE_REG X_%d, #%d\n" % (reg_count-1, offset))
        offset -= 16

    pair_starts = list(range(self.volatile_registers, reg_count-1, 2))
    while pair_starts:
        first = pair_starts.pop()
        emitted.append(" EPILOG_RESTORE_REG_PAIR X_%d, X_%d, #%d\n" % (first, first+1, offset))
        offset -= 16

    emitted.append(" EPILOG_RESTORE_REG_PAIR fp, lr, #%d! // deallocate %d bytes of stack; restore FP/LR\n" % (frame_size, frame_size))
    emitted.append(" EPILOG_RETURN\n")
    return "".join(emitted)
def gen_get_memslot_offset_arm64(self, slot, arg_count, reg_count, nested=False):
    """Reject any request for a memory slot offset on arm64.

    Memory slots are not implemented for arm64, so every call logs an error and exits the
    script with status 1; none of the arguments are inspected.
    """
    unsupported_message = "symcryptasm currently does not support memory slots for arm64!"
    logging.error(unsupported_message)
    exit(1)
# The positional arguments after the mapping are, in order: max_arguments, argument_registers,
# volatile_registers (see CallingConvention.__init__).
# aapcs64: up to 8 arguments, all 8 passed in registers, and the first 18 mapped registers
# (indices 0-17) are volatile. Its prologue/epilogue generators reject any reg_count above that.
CALLING_CONVENTION_ARM64_AAPCS64 = CallingConvention(
"arm64_aapcs64", "arm64", MAPPING_ARM64_AAPCS64, 8, 8, 18,
gen_prologue_aapcs64, gen_epilogue_aapcs64, gen_get_memslot_offset_arm64)
# arm64ec: up to 8 arguments, all 8 passed in registers, and the first 16 mapped registers
# (indices 0-15) are volatile. Its prologue/epilogue generators save and restore any registers
# used beyond those.
CALLING_CONVENTION_ARM64EC_MSFT = CallingConvention(
"arm64ec_msft", "arm64", MAPPING_ARM64_ARM64ECMSFT, 8, 8, 16,
gen_prologue_arm64ec, gen_epilogue_arm64ec, gen_get_memslot_offset_arm64)
def gen_function_defines(architecture, mapping, arg_count, reg_count, start=True):
defines = ""
if architecture == "amd64":
prefix64 = "Q"
prefix32 = "D"
prefix16 = "W"
prefix8 = "B"
elif architecture == "arm64":
prefix64 = "X_"
prefix32 = "W_"
else:
logging.error("Unhandled architecture (%s) in gen_function_defines" % architecture)
exit(1)
for (index, reg) in mapping.items():
if (index != 'H') and (index >= max(arg_count+1, reg_count)):
continue
defines += "#define Q%s %s\n" % (index, reg.name64)
defines += "#define D%s %s\n" % (index, reg.name32)
defines += "#define W%s %s\n" % (index, reg.name16)
defines += "#define B%s %s\n" % (index, reg.name8)
if start:
if (reg.name64 is not None):
defines += "#define %s%s %s\n" % (prefix64, index, reg.name64)
if (reg.name32 is not None):
defines += "#define %s%s %s\n" % (prefix32, index, reg.name32)
if (reg.name16 is not None):
defines += "#define %s%s %s\n" % (prefix16, index, reg.name16)
if (reg.name8 is not None):
defines += "#define %s%s %s\n" % (prefix8, index, reg.name8)
else:
if (reg.name64 is not None):
defines += "#undef %s%s\n" % (prefix64, index)
if (reg.name32 is not None):
defines += "#undef %s%s\n" % (prefix32, index)
if (reg.name16 is not None):
defines += "#undef %s%s\n" % (prefix16, index)
if (reg.name8 is not None):
defines += "#undef %s%s\n" % (prefix8, index)
return defines
def gen_function_end_defines(mapping, arg_count, reg_count):
undefs = ""
for (index, _) in mapping.items():
if (index != 'H') and (index >= max(arg_count+1, reg_count)):
continue
undefs += "#undef Q%s\n" % (index)
undefs += "#undef D%s\n" % (index)
undefs += "#undef W%s\n" % (index)
undefs += "#undef B%s\n" % (index)
return undefs
def gen_function_start_defines(architecture, mapping, arg_count, reg_count):
return gen_function_defines(architecture, mapping, arg_count, reg_count, start=True)
MASM_FRAMELESS_FUNCTION_ENTRY = "LEAF_ENTRY %s, _TEXT\n"
MASM_FRAMELESS_FUNCTION_END = "LEAF_END %s, _TEXT\n"
MASM_FRAME_FUNCTION_ENTRY = "NESTED_ENTRY %s, _TEXT\n"
MASM_FRAME_FUNCTION_END = "NESTED_END %s, _TEXT\n"
def gen_function_end_defines(architecture, mapping, arg_count, reg_count):
return gen_function_defines(architecture, mapping, arg_count, reg_count, start=False)
MASM_FRAMELESS_FUNCTION_ENTRY = "LEAF_ENTRY %s"
MASM_FRAMELESS_FUNCTION_END = "LEAF_END %s"
MASM_FRAME_FUNCTION_ENTRY = "NESTED_ENTRY %s"
MASM_FRAME_FUNCTION_END = "NESTED_END %s"
# MASM function macros takes the text area as an argument
MASM_FUNCTION_TEMPLATE = "%s, _TEXT\n"
# ARMASM64 function macros must be correctly indented
ARMASM64_FUNCTION_TEMPLATE = " %s\n"
GAS_FUNCTION_ENTRY = "%s: .global %s\n"
GAS_FUNCTION_END = ""
def generate_prologue(assembler, calling_convention, function_name, arg_count, reg_count, nested):
function_entry = None
if assembler == "masm":
# need to identify and mark up frame functions in masm
if assembler in ["masm", "armasm64"]:
# need to identify and mark up frame functions in masm and armasm64
if nested or (reg_count > calling_convention.volatile_registers):
function_entry = MASM_FRAME_FUNCTION_ENTRY % (function_name)
else:
function_entry = MASM_FRAMELESS_FUNCTION_ENTRY % (function_name)
if assembler == "masm":
function_entry = MASM_FUNCTION_TEMPLATE % function_entry
elif assembler == "armasm64":
function_entry = ARMASM64_FUNCTION_TEMPLATE % function_entry
elif assembler == "gas":
function_entry = GAS_FUNCTION_ENTRY % (function_name, function_name)
else:
logging.error("Unhandled assembler (%s) in generate_prologue" % assembler)
exit(1)
prologue = gen_function_start_defines(calling_convention.mapping, arg_count, reg_count)
prologue = gen_function_start_defines(calling_convention.architecture, calling_convention.mapping, arg_count, reg_count)
prologue += "%s" % (function_entry)
prologue += calling_convention.gen_prologue_fn(arg_count, reg_count)
@ -413,31 +631,41 @@ def generate_prologue(assembler, calling_convention, function_name, arg_count, r
def generate_epilogue(assembler, calling_convention, function_name, arg_count, reg_count, nested):
function_end = None
if assembler == "masm":
if assembler in ["masm", "armasm64"]:
# need to identify and mark up frame functions in masm and armasm64
if nested or (reg_count > calling_convention.volatile_registers):
function_end = MASM_FRAME_FUNCTION_END % (function_name)
else:
function_end = MASM_FRAMELESS_FUNCTION_END % (function_name)
if assembler == "masm":
function_end = MASM_FUNCTION_TEMPLATE % function_end
elif assembler == "armasm64":
function_end = ARMASM64_FUNCTION_TEMPLATE % function_end
elif assembler == "gas":
function_end = GAS_FUNCTION_END
else:
logging.error("Unhandled assembler (%s) in generate_epilogue" % assembler)
exit(1)
epilogue = calling_convention.gen_epilogue_fn(arg_count, reg_count)
epilogue += "%s" % (function_end)
epilogue += gen_function_end_defines(calling_convention.mapping, arg_count, reg_count)
epilogue += gen_function_end_defines(calling_convention.architecture, calling_convention.mapping, arg_count, reg_count)
return epilogue
MASM_MACRO_START = "%s MACRO %s\n"
MASM_MACRO_END = "ENDM\n"
ARMASM64_MACRO_START= " MACRO\n %s %s"
ARMASM64_MACRO_END = " MEND\n"
GAS_MACRO_START = ".macro %s %s\n"
GAS_MACRO_END = ".endm\n"
MASM_ALTERNATE_ENTRY= "ALTERNATE_ENTRY %s\n"
GAS_ALTERNATE_ENTRY = "%s: .global %s\n"
FUNCTION_START_PATTERN = re.compile("\s*(NESTED_)?(MUL_)?FUNCTION_START\s*\(\s*([a-zA-Z0-9]+)\s*,\s*([0-9]+)\s*,\s*([0-9]+)\s*\)")
FUNCTION_END_PATTERN = re.compile("\s*(NESTED_)?(MUL_)?FUNCTION_END\s*\(\s*([a-zA-Z0-9]+)\s*\)")
FUNCTION_START_PATTERN = re.compile("\s*(NESTED_)?(MUL_)?FUNCTION_START\s*\(\s*([a-zA-Z0-9_\(\)]+)\s*,\s*([0-9]+)\s*,\s*([0-9]+)\s*\)")
FUNCTION_END_PATTERN = re.compile("\s*(NESTED_)?(MUL_)?FUNCTION_END\s*\(\s*([a-zA-Z0-9_\(\)]+)\s*\)")
GET_MEMSLOT_PATTERN = re.compile("GET_MEMSLOT_OFFSET\s*\(\s*slot([0-9]+)\s*\)")
ALTERNATE_ENTRY_PATTERN = re.compile("\s*ALTERNATE_ENTRY\s*\(\s*([a-zA-Z0-9]+)\s*\)")
MACRO_START_PATTERN = re.compile("\s*MACRO_START\s*\(\s*([A-Z_0-9]+)\s*,([^\)]+)\)")
@ -499,29 +727,41 @@ class ProcessingStateMachine:
self.arg_count = int(match.groups()[-2])
self.reg_count = int(match.groups()[-1])
if self.is_nested_function and self.nested_calling_convention is None:
logging.error(
"symcryptasm nested functions are not currently supported with assembler (%s) and architecture (%s)!\n\t"
"%s (line %d)"
% (self.assembler, self.normal_calling_convention.architecture, line, line_num))
exit(1)
if self.is_mul_function and self.mul_calling_convention is None:
logging.error(
"symcryptasm mul functions are not supported with assembler (%s) and architecture (%s)!\n\t"
"%s (line %d)"
% (self.assembler, self.normal_calling_convention.architecture, line, line_num))
exit(1)
if self.is_nested_function and self.is_mul_function:
logging.error(
"Too many prefixes for symcryptasm function - currently only 1 of prefix, MUL_ or NESTED_, is supported!\n\t"
"%s (line %d)"
% (line, line_num))
exit(1)
if self.arg_count > MAX_FUNCTION_ARGUMENT_COUNT:
if self.arg_count > self.normal_calling_convention.max_arguments:
logging.error(
"Too many (%d) arguments for symcryptasm function - currently only %d arguments are supported!\n\t"
"Too many (%d) arguments for symcryptasm function - only %d arguments are supported by calling convention (%s)\n\t"
"%s (line %d)"
% (self.arg_count, MAX_FUNCTION_ARGUMENT_COUNT, match.group(0), line_num))
% (self.arg_count, self.normal_calling_convention.max_arguments, self.normal_calling_convention.name, match.group(0), line_num))
exit(1)
if self.reg_count > MAX_FUNCTION_REGISTER_COUNT:
if self.reg_count > len(self.normal_calling_convention.mapping):
logging.error(
"Too many (%d) registers required for symcryptasm function - only %d registers are supported!\n\t"
"Too many (%d) registers required for symcryptasm function - only %d registers are mapped by calling convention (%s)\n\t"
"%s (line %d)"
% (self.reg_count, MAX_FUNCTION_REGISTER_COUNT, match.group(0), line_num))
% (self.reg_count, len(self.normal_calling_convention.mapping), self.normal_calling_convention.name, match.group(0), line_num))
exit(1)
if self.is_mul_function and self.reg_count > MAX_FUNCTION_REGISTER_COUNT-1:
if self.is_mul_function and self.reg_count > len(self.mul_calling_convention.mapping)-1:
logging.error(
"Too many (%d) registers required for symcryptasm mul function - only %d registers are supported!\n\t"
"Too many (%d) registers required for symcryptasm mul function - only %d registers are mapped by calling convention (%s)\n\t"
"%s (line %d)"
% (self.reg_count, MAX_FUNCTION_REGISTER_COUNT-1, match.group(0), line_num))
% (self.reg_count, len(self.mul_calling_convention.mapping)-1, self.mul_calling_convention.name, match.group(0), line_num))
exit(1)
logging.info("%d: function start %s, %d, %d" % (line_num, self.function_name, self.arg_count, self.reg_count))
@ -546,10 +786,18 @@ class ProcessingStateMachine:
return MASM_MACRO_START % (self.macro_name, match.group(2))
elif self.assembler == "gas":
return GAS_MACRO_START % (self.macro_name, match.group(2))
elif self.assembler == "armasm64":
# In armasm64 we need to escape all macro arguments with $
prefixed_args = ", $".join(self.macro_args)
if prefixed_args:
prefixed_args = "$" + prefixed_args
return ARMASM64_MACRO_START % (self.macro_name, prefixed_args)
else:
logging.error("Unhandled assembler (%s) in process_start_macro" % assembler)
exit(1)
def process_function_line(self, line, line_num):
# Currently in a function
match = ALTERNATE_ENTRY_PATTERN.match(line)
if (match):
if self.assembler == "masm":
@ -562,12 +810,12 @@ class ProcessingStateMachine:
# Check the end function has same prefix as previous start function
if (self.is_nested_function ^ (match.group(1) == "NESTED_")) or \
(self.is_mul_function ^ (match.group(2) == "MUL_")):
logging.error("Function start and end do not have same MUL_ or NESTED_ prefix!\n\tStart: %s (line %d)\n\tEnd: %s (line %d)"
logging.error("Function start and end do not have same MUL_ or NESTED_ prefix!\n\tStart: %s (line %d)\n\tEnd: %s (line %d)" \
% (self.function_start_match.group(0), self.function_start_line, match.group(0), line_num))
exit(1)
# Check the end function pattern has the same label as the previous start function pattern
if self.function_name != match.groups()[-1]:
logging.error("Function start label does not match Function end label!\n\tStart: %s (line %d)\n\tEnd: %s (line %d)"
logging.error("Function start label does not match Function end label!\n\tStart: %s (line %d)\n\tEnd: %s (line %d)" \
% (self.function_name, self.function_start_line, match.groups()[-1], line_num))
exit(1)
@ -613,8 +861,18 @@ class ProcessingStateMachine:
return MASM_MACRO_END
elif self.assembler == "gas":
return GAS_MACRO_END
elif self.assembler == "armasm64":
return ARMASM64_MACRO_END
else:
logging.error("Unhandled assembler (%s) in process_macro_line" % self.assembler)
exit(1)
if self.assembler == "gas":
if self.assembler == "armasm64":
# In armasm64 macros we need to escape all of the macro arguments with a $ in the macro body
for arg in self.macro_args:
line = re.sub(arg, "$%s" % arg, line)
elif self.assembler == "gas":
# In GAS macros we need to escape all of the macro arguments with a backslash in the macro body
for arg in self.macro_args:
line = re.sub(arg, r"\\%s" % arg, line)
@ -622,18 +880,40 @@ class ProcessingStateMachine:
# Not modifying the line any further
return line
def process_file(target, infilename, outfilename):
assembler = None
if target == "masm":
assembler = "masm"
def process_file(assembler, architecture, calling_convention, infilename, outfilename):
normal_calling_convention = None
if assembler == "masm":
if architecture == "amd64" and calling_convention == "msft":
normal_calling_convention = CALLING_CONVENTION_AMD64_MSFT
mul_calling_convention = CALLING_CONVENTION_AMD64_MSFT_MUL
nested_calling_convention = CALLING_CONVENTION_AMD64_MSFT_NESTED
elif target == "gas":
assembler = "gas"
elif assembler == "gas":
if architecture == "amd64" and calling_convention == "systemv":
normal_calling_convention = CALLING_CONVENTION_AMD64_SYSTEMV
mul_calling_convention = CALLING_CONVENTION_AMD64_SYSTEMV_MUL
nested_calling_convention = CALLING_CONVENTION_AMD64_SYSTEMV_NESTED
elif architecture == "arm64" and calling_convention == "aapcs64":
normal_calling_convention = CALLING_CONVENTION_ARM64_AAPCS64
mul_calling_convention = None
nested_calling_convention = None
elif assembler == "armasm64":
if architecture == "arm64" and calling_convention == "aapcs64":
normal_calling_convention = CALLING_CONVENTION_ARM64_AAPCS64
mul_calling_convention = None
nested_calling_convention = None
elif architecture == "arm64" and calling_convention == "arm64ec":
normal_calling_convention = CALLING_CONVENTION_ARM64EC_MSFT
mul_calling_convention = None
nested_calling_convention = None
else:
logging.error("Unhandled assembler (%s) in process_file" % assembler)
exit(1)
if normal_calling_convention is None:
logging.error("Unhandled combination (%s + %s + %s) in process_file"
% (assembler, architecture, calling_convention))
exit(1)
# iterate through file line by line in one pass
file_processing_state = ProcessingStateMachine(
@ -649,9 +929,11 @@ if __name__ == "__main__":
# Command-line entry point: declare the positional arguments, parse them, and hand them to process_file.
# NOTE(review): this span appears to be a diff hunk rendered without +/- markers or indentation,
# so superseded and replacement lines sit next to each other - confirm against the real file.
import argparse
# NOTE(review): the description names only MASM and GAS although 'armasm64' is an accepted assembler below.
parser = argparse.ArgumentParser(description="Preprocess symcryptasm into files that will be further processed with C preprocessor to generate MASM or GAS")
# Presumably the pre-change single 'target' positional, superseded by the three positionals below - verify.
parser.add_argument('target', type=str, help='Target that we want to preprocess for')
# argparse rejects any value outside the listed choices for each of the next three positionals.
parser.add_argument('assembler', type=str, help='Assembler that we want to preprocess for', choices=['masm', 'gas', 'armasm64'])
parser.add_argument('architecture', type=str, help='Architecture that we want to preprocess for', choices=['amd64', 'arm64'])
parser.add_argument('calling_convention', type=str, help='Calling convention that we want to preprocess for', choices=['msft', 'systemv', 'aapcs64', 'arm64ec'])
# Both paths are taken as plain strings; no existence check is performed here.
parser.add_argument('inputfile', type=str, help='Path to input file')
parser.add_argument('outputfile', type=str, help='Path to output file')
args = parser.parse_args()
# Presumably the pre-change call (it reads args.target) - verify.
process_file(args.target, args.inputfile, args.outputfile)
# Passes assembler, architecture and calling convention ahead of the input and output paths.
process_file(args.assembler, args.architecture, args.calling_convention, args.inputfile, args.outputfile)