Merged PR 9576163: Build SymCrypt with gcc-arm-linux-gnueabihf

Changyu Li 2023-10-13 19:26:47 +00:00
Parent 17360b237b
Commit b6a267815e
22 changed files: 2573 additions and 290 deletions

View file

@@ -180,4 +180,18 @@ extends:
config: 'Release'
cc: 'clang'
cxx: 'clang++'
additionalArgs: '--toolchain=cmake-configs/Toolchain-Clang-ARM64.cmake'
- template: .pipelines/templates/build-linux.yml@self
parameters:
arch: 'ARM'
config: 'Debug'
cc: 'gcc'
cxx: 'g++'
additionalArgs: '--toolchain=cmake-configs/Toolchain-GCC-ARM.cmake'
- template: .pipelines/templates/build-linux.yml@self
parameters:
arch: 'ARM'
config: 'Release'
cc: 'gcc'
cxx: 'g++'
additionalArgs: '--toolchain=cmake-configs/Toolchain-GCC-ARM.cmake'

View file

@@ -74,6 +74,12 @@ jobs:
apt-get install -y binutils-aarch64-linux-gnu gcc-aarch64-linux-gnu g++-aarch64-linux-gnu qemu-user
displayName: 'Install arm64 cross-compilation tools'
- ${{ if eq(parameters.arch, 'ARM') }}:
- script: |
apt-get update
apt-get install -y gcc-arm-linux-gnueabihf g++-arm-linux-gnueabihf qemu-user
displayName: 'Install arm cross-compilation tools'
- task: PipAuthenticate@1
inputs:
artifactFeeds: 'OS/SymCrypt_PublicPackages'
@@ -92,23 +98,24 @@ jobs:
- ${{ if ne(parameters.skipTests, true) }}:
- ${{ if ne(parameters.arch, 'ARM64') }}:
- task: PythonScript@0
displayName: 'Run unit tests'
inputs:
scriptSource: 'filePath'
scriptPath: scripts/test.py
arguments: 'bin noperftests'
workingDirectory: $(Build.SourcesDirectory)
- ${{ if ne(parameters.config, 'Sanitize') }}:
- ${{ if ne(parameters.arch, 'ARM') }}:
- task: PythonScript@0
displayName: 'Run dynamic unit tests'
displayName: 'Run unit tests'
inputs:
scriptSource: 'filePath'
scriptPath: scripts/test.py
arguments: 'bin dynamic:bin/module/generic/libsymcrypt.so noperftests'
arguments: 'bin noperftests'
workingDirectory: $(Build.SourcesDirectory)
- ${{ if ne(parameters.config, 'Sanitize') }}:
- task: PythonScript@0
displayName: 'Run dynamic unit tests'
inputs:
scriptSource: 'filePath'
scriptPath: scripts/test.py
arguments: 'bin dynamic:bin/module/generic/libsymcrypt.so noperftests'
workingDirectory: $(Build.SourcesDirectory)
- ${{ if eq(parameters.arch, 'AMD64') }}:
- task: PythonScript@0
displayName: 'Run unit tests (test YMM save/restore)'
@@ -134,6 +141,23 @@ jobs:
scriptPath: scripts/test.py
arguments: '--emulator qemu-aarch64 --emulator-lib-dir /usr/aarch64-linux-gnu/ bin dynamic:bin/module/generic/libsymcrypt.so noperftests +symcrypt -dh -dsa -rsa'
workingDirectory: $(Build.SourcesDirectory)
- ${{ if eq(parameters.arch, 'ARM') }}:
- task: PythonScript@0
displayName: 'Run unit tests'
inputs:
scriptSource: 'filePath'
scriptPath: scripts/test.py
arguments: '--emulator qemu-arm --emulator-lib-dir /usr/arm-linux-gnueabihf/ bin noperftests +symcrypt -dh -dsa -rsa'
workingDirectory: $(Build.SourcesDirectory)
- task: PythonScript@0
displayName: 'Run dynamic unit tests'
inputs:
scriptSource: 'filePath'
scriptPath: scripts/test.py
arguments: '--emulator qemu-arm --emulator-lib-dir /usr/arm-linux-gnueabihf/ bin dynamic:bin/module/generic/libsymcrypt.so noperftests +symcrypt -dh -dsa -rsa'
workingDirectory: $(Build.SourcesDirectory)
- task: PythonScript@0
displayName: 'Package build output'

View file

@@ -79,4 +79,25 @@ Building the SymCrypt unit tests with MSBuild requires access to the msbignum an
be released externally due to licensing restrictions. If you wish to build directly with MSBuild, bypassing the Python
helper script, you can run `msbuild /p:Platform=<platform> /p:Architecture=<arch> symcrypt.sln`. Note that Python is
still required for translating SymCryptAsm. The output directory for MSBuild is always `build\bin`, and all compiled
outputs are placed in this directory.

## Building Linux targets
Building on Debian-based systems requires the following packages:
```
# python3-pyelftools is used for the integrity checks;
# the last two packages are the arm cross compilers
apt-get -y install --no-install-recommends \
    cmake \
    python3-pyelftools \
    gcc-arm-linux-gnueabihf g++-arm-linux-gnueabihf
```
And to run the tests:
```
apt-get -y install --no-install-recommends qemu-user
```
For example, to build and test for arm:
```
python3 scripts/build.py cmake --arch arm --toolchain cmake-configs/Toolchain-GCC-ARM.cmake bin_arm
qemu-arm -L /usr/arm-linux-gnueabihf/ ./bin_arm/exe/symcryptunittest -rsa -dsa -dh -ec -int -mod dynamic:bin_arm/module/generic/libsymcrypt.so
```

View file

@@ -52,6 +52,8 @@ if(CMAKE_SYSTEM_NAME MATCHES "Linux")
# GCC complains about implicit casting between ASIMD registers (i.e. uint8x16_t -> uint64x2_t) by default,
# whereas clang and MSVC do not. Setting -flax-vector-conversions to build Arm64 intrinsics code with GCC.
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -flax-vector-conversions")
elseif(SYMCRYPT_TARGET_ARCH MATCHES "ARM")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=armv7-a+neon-vfpv4 -flax-vector-conversions -mfpu=neon")
endif()
# add_compile_options(-Wall)
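For context, the conversion the flag permits looks like this; a minimal sketch (the function name is illustrative, not from the codebase):

```
#include <arm_neon.h>

/* GCC rejects this implicit reinterpretation between same-size integer
 * vector types unless -flax-vector-conversions is given; clang and MSVC
 * accept it, which is why the flag is added above for GCC builds. */
uint64x2_t as_uint64x2(uint8x16_t v)
{
    return v;   /* implicit uint8x16_t -> uint64x2_t conversion */
    /* strict alternative: return vreinterpretq_u64_u8(v); */
}
```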

View file

@@ -0,0 +1,22 @@
# This toolchain file configures ARM cross-compilation with GCC.
# To use the toolchain file, run cmake .. --toolchain="../cmake-configs/Toolchain-GCC-ARM.cmake"
# Note: the --toolchain argument is only available in CMake v3.21 and newer. Prior to that version, you will have to
# specify -DCMAKE_TOOLCHAIN_FILE instead, which may cause a spurious warning about an unused variable.
set(CMAKE_SYSTEM_PROCESSOR ARM)
set(TARGET_TRIPLE arm-linux-gnueabihf)
set(CMAKE_ASM_COMPILER_TARGET ${TARGET_TRIPLE})
set(CMAKE_C_COMPILER arm-linux-gnueabihf-gcc)
set(CMAKE_C_COMPILER_TARGET ${TARGET_TRIPLE})
set(CMAKE_CXX_COMPILER arm-linux-gnueabihf-g++)
set(CMAKE_CXX_COMPILER_TARGET ${TARGET_TRIPLE})
if(NOT CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "ARM|arm")
# C/C++ toolchain (installed on Ubuntu using apt-get gcc-arm-linux-gnueabihf g++-arm-linux-gnueabihf)
set(CMAKE_SYSROOT_COMPILE /usr/${TARGET_TRIPLE})
find_path(CXX_CROSS_INCLUDE_DIR NAMES ${TARGET_TRIPLE} PATHS /usr/${TARGET_TRIPLE}/include/ /usr/${TARGET_TRIPLE}/include/c++/ PATH_SUFFIXES 15 14 13 12 11 10 9 8 7 6 5 NO_DEFAULT_PATH)
add_compile_options(-I${CXX_CROSS_INCLUDE_DIR}/${TARGET_TRIPLE})
endif()

View file

@@ -1568,7 +1568,12 @@ typedef union _SYMCRYPT_GCM_SUPPORTED_BLOCKCIPHER_KEYS
#if SYMCRYPT_CPU_ARM
#include <arm_neon.h>
#if SYMCRYPT_GNUC
#define __n128 uint32x4_t
#define __n64 uint64x1_t
#endif
#elif SYMCRYPT_CPU_ARM64
#if SYMCRYPT_MS_VC
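The effect of the mappings above, as a small hedged sketch (xor_block is illustrative, not a SymCrypt function):

```
#include <arm_neon.h>

#define __n128 uint32x4_t   /* as in the hunk above, for GCC on 32-bit ARM */

/* Code written against MSVC's __n128 vector type now also builds with
 * GCC, because __n128 aliases a native arm_neon.h type. */
static __n128 xor_block(__n128 a, __n128 b)
{
    return veorq_u32(a, b);
}
```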

View file

@@ -125,10 +125,12 @@ function(process_cppasm filepath outformat archdefine)
if((NOT outformat STREQUAL gas) AND (NOT outformat STREQUAL masm))
message(FATAL_ERROR "cppasm processing invoked with unrecognized outformat (${outformat})")
endif()
if((NOT archdefine STREQUAL amd64) AND (NOT archdefine STREQUAL x86) AND (NOT archdefine STREQUAL arm64))
if((NOT archdefine STREQUAL amd64) AND (NOT archdefine STREQUAL x86) AND (NOT archdefine STREQUAL arm64) AND (NOT archdefine STREQUAL arm))
message(FATAL_ERROR "cppasm processing invoked with unrecognized archdefine (${archdefine})")
endif()
get_filename_component(rootpath ${filepath} DIRECTORY)
get_filename_component(filestem ${filepath} NAME_WE) # filestem is the filename w/out extension
string(TOUPPER ${outformat} outformatupper)
string(TOUPPER ${archdefine} archdefineupper)
@@ -179,10 +181,10 @@ function(process_symcryptasm filepath outformat archdefine callingconvention)
if((NOT outformat STREQUAL gas) AND (NOT outformat STREQUAL masm) AND (NOT outformat STREQUAL armasm64))
message(FATAL_ERROR "symcryptasm processing invoked with unrecognized outformat (${outformat})")
endif()
if((NOT archdefine STREQUAL amd64) AND (NOT archdefine STREQUAL x86) AND (NOT archdefine STREQUAL arm64))
if((NOT archdefine STREQUAL amd64) AND (NOT archdefine STREQUAL x86) AND (NOT archdefine STREQUAL arm64) AND (NOT archdefine STREQUAL arm))
message(FATAL_ERROR "symcryptasm processing invoked with unrecognized archdefine (${archdefine})")
endif()
if((NOT callingconvention STREQUAL msft) AND (NOT callingconvention STREQUAL systemv) AND (NOT callingconvention STREQUAL aapcs64) AND (NOT callingconvention STREQUAL arm64ec))
if((NOT callingconvention STREQUAL msft) AND (NOT callingconvention STREQUAL systemv) AND (NOT callingconvention STREQUAL aapcs64) AND (NOT callingconvention STREQUAL arm64ec) AND (NOT callingconvention STREQUAL aapcs32))
message(FATAL_ERROR "symcryptasm processing invoked with unrecognized callingconvention (${callingconvention})")
endif()
get_filename_component(rootpath ${filepath} DIRECTORY)
@@ -229,8 +231,8 @@ if(WIN32 AND SYMCRYPT_USE_ASM)
amd64/fdef369_asm-masm.asm
amd64/fdef_mulx-masm.asm
amd64/wipe-masm.asm
amd64/sha256xmm_asm-masm.asm
amd64/sha256ymm_asm-masm.asm
amd64/sha512ymm_asm-masm.asm
amd64/sha512ymm_avx512vl_asm-masm.asm)
set_source_files_properties(
@@ -239,8 +241,8 @@ if(WIN32 AND SYMCRYPT_USE_ASM)
amd64/fdef369_asm-masm.asm
amd64/fdef_mulx-masm.asm
amd64/wipe-masm.asm
amd64/sha256xmm_asm-masm.asm
amd64/sha256ymm_asm-masm.asm
amd64/sha512ymm_asm-masm.asm
amd64/sha512ymm_avx512vl_asm-masm.asm
PROPERTY LANGUAGE ASM_MASM)
@@ -294,7 +296,7 @@ elseif(SYMCRYPT_USE_ASM) # Linux
amd64/sha512ymm_asm-gas.asm
amd64/sha512ymm_avx512vl_asm-gas.asm
PROPERTY LANGUAGE ASM)
elseif(SYMCRYPT_TARGET_ARCH MATCHES "ARM64")
elseif(SYMCRYPT_TARGET_ARCH STREQUAL "ARM64")
process_symcryptasm(arm64/fdef_asm.symcryptasm gas arm64 aapcs64)
process_symcryptasm(arm64/fdef369_asm.symcryptasm gas arm64 aapcs64)
process_symcryptasm(arm64/wipe.symcryptasm gas arm64 aapcs64)
@@ -308,6 +310,22 @@ elseif(SYMCRYPT_USE_ASM) # Linux
arm64/fdef369_asm-gas.asm
arm64/wipe-gas.asm
PROPERTY LANGUAGE ASM)
elseif(SYMCRYPT_TARGET_ARCH STREQUAL "ARM")
process_symcryptasm(arm/aesasm.symcryptasm gas arm aapcs32)
process_symcryptasm(arm/fdef_asm.symcryptasm gas arm aapcs32)
process_symcryptasm(arm/wipe.symcryptasm gas arm aapcs32)
list(APPEND SOURCES_COMMON
arm/aesasm-gas.asm
arm/fdef_asm-gas.asm
arm/wipe-gas.asm)
set_source_files_properties(
arm/aesasm-gas.asm
arm/fdef_asm-gas.asm
arm/wipe-gas.asm
PROPERTY LANGUAGE ASM)
set_source_files_properties(
arm/fdef_asm-gas.asm PROPERTIES INCLUDE_DIRECTORIES ${CMAKE_CURRENT_SOURCE_DIR}/arm)
endif()
endif()

lib/arm/aesasm.symcryptasm (new file, 977 lines)
View file

@@ -0,0 +1,977 @@
//
// aesasm.symcryptasm Assembler code for fast AES on ARM
//
// Copyright (c) Microsoft Corporation. Licensed under the MIT license.
//
// This code is derived from the AMD64 version of the AesFast
// implementation, developed by Niels Ferguson. For questions
// about the ARM specifics, contact Aaron Giles.
//
// #include "kxarm.h"
#include "symcryptasm_shared.cppasm"
#if SYMCRYPT_DEBUG
SET(SYMCRYPT_CODE_VERSION, ((SYMCRYPT_CODE_VERSION_API * 65536) + SYMCRYPT_CODE_VERSION_MINOR))
SET(SYMCRYPT_MAGIC_CONSTANT, HEX(53316D76) + SYMCRYPT_CODE_VERSION) // 0x53316D76 == 'S1mv'
MACRO_START(SYMCRYPT_CHECK_MAGIC, temp1, temp2, ptr, offset, check_magic_label)
ldr temp1, [ptr, #offset]
subs temp1, temp1, ptr
//mov32 temp2, =SYMCRYPT_MAGIC_CONSTANT
ldr temp2, =SYMCRYPT_MAGIC_CONSTANT
cmp temp1, temp2
beq check_magic_label
//mov32 r0, 0x6d616763 // 'magc'
ldr r0, =0x6d616763 // 'magc'
bl SymCryptFatal
check_magic_label:
MACRO_END()
#else
MACRO_START(SYMCRYPT_CHECK_MAGIC, temp1, temp2, ptr, offset, check_magic_label)
MACRO_END()
#endif
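In C terms, the debug build of this macro performs roughly the following check (a sketch; the constant is a stand-in for SYMCRYPT_MAGIC_CONSTANT and checkMagic is our name, not a SymCrypt API):

```
#include <stddef.h>
#include <stdint.h>
#include <string.h>

extern void SymCryptFatal(uint32_t fatalCode);

#define MAGIC_CONSTANT 0x53316D76u   /* stand-in; the real value adds the code version */

/* The stored magic is the structure's own address plus a constant, so a
 * key structure that was byte-copied to a different address fails. */
static void checkMagic(const uint8_t *ptr, size_t magicOffset)
{
    uintptr_t stored;
    memcpy(&stored, ptr + magicOffset, sizeof(stored));
    if (stored - (uintptr_t)ptr != MAGIC_CONSTANT)
        SymCryptFatal(0x6d616763);   /* 'magc' */
}
```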
// TTL "Advanced Encryption Standard (AES)"
//
// typedef SYMCRYPT_ALIGN_STRUCT _SYMCRYPT_AES_EXPANDED_KEY {
// SYMCRYPT_ALIGN BYTE RoundKey[29][4][4];
// Round keys, first the encryption round keys in encryption order,
// followed by the decryption round keys in decryption order.
// The first decryption round key is the last encryption round key.
// AES-256 has 14 rounds and thus 15 round keys for encryption and 15
// for decryption. As they share one round key, we need room for 29.
// BYTE (*lastEncRoundKey)[4][4]; Pointer to last encryption round key
// also the first round key for decryption
// BYTE (*lastDecRoundKey)[4][4]; Pointer to last decryption round key.
//
// SYMCRYPT_MAGIC_FIELD
// } SYMCRYPT_AES_EXPANDED_KEY, *PSYMCRYPT_AES_EXPANDED_KEY
//
#define SYMCRYPT_AES_EXPANDED_KEY_RoundKey (0)
#define SYMCRYPT_AES_EXPANDED_KEY_lastEncRoundKey (29*4*4)
#define SYMCRYPT_AES_EXPANDED_KEY_lastDecRoundKey (29*4*4+4)
#if SYMCRYPT_DEBUG
#define SYMCRYPT_AES_EXPANDED_KEY_magic (29*4*4+4+4)
#endif
MACRO_START(ENC_MIX, keyptr)
//
// Perform the unkeyed mixing function for encryption
// plus a key addition from the key pointer
//
// Input:
// r0,r1,r2,r3 = current block
// keyptr = pointer to current key
// r12 = pointer to AesSboxMatrixMult
//
// Output:
// r0,r1,r2,r3 = updated block
// keyptr = updated to point to following key
// r12 = unmodified
//
// Used:
// r4,r5,r6,r7,lr are modified
//
// N.B. To make better use of ARM's barrel shifter, this code differs
// from the AMD64 approach. The first lookups are not rotated;
// instead all subsequent lookups are applied on top rotated, and
// then a final rotation is performed to shift the bits into the
// proper spot.
//
uxtb r4, r0 // extract individual bytes from r0
uxtb r7, r0, ror #8 //
uxtb r6, r0, ror #16 //
uxtb r5, r0, ror #24 //
ldr r4, [r12, r4, lsl #2] // perform lookups of each byte, leaving
ldr r7, [r12, r7, lsl #2] // the values unrotated for now
ldr r6, [r12, r6, lsl #2] //
ldr r5, [r12, r5, lsl #2] //
uxtb r0, r1 // extract individual bytes from r1
uxtb lr, r1, ror #8 // (with 1 more register we could do 4 at a time)
ldr r0, [r12, r0, lsl #2] // perform lookups
ldr lr, [r12, lr, lsl #2] //
eor r5, r5, r0, ror #24 // exclusive-OR with previous
eor r4, r4, lr, ror #24 //
uxtb r0, r1, ror #16 // extract remaining bytes from r1
uxtb r1, r1, ror #24 //
ldr r0, [r12, r0, lsl #2] // perform lookups
ldr r1, [r12, r1, lsl #2] //
eor r7, r7, r0, ror #24 // exclusive-OR with previous
eor r6, r6, r1, ror #24 //
uxtb r0, r2 // extract individual bytes from r2
uxtb r1, r2, ror #8 //
uxtb lr, r2, ror #16 //
uxtb r2, r2, ror #24 //
ldr r0, [r12, r0, lsl #2] // perform lookups
ldr r1, [r12, r1, lsl #2] //
ldr lr, [r12, lr, lsl #2] //
ldr r2, [r12, r2, lsl #2] //
eor r6, r6, r0, ror #16 // exclusive-OR with previous
eor r5, r5, r1, ror #16 //
eor r4, r4, lr, ror #16 //
eor r7, r7, r2, ror #16 //
uxtb r0, r3 // extract individual bytes from r3
uxtb r1, r3, ror #8 //
uxtb r2, r3, ror #16 //
uxtb r3, r3, ror #24 //
ldr r0, [r12, r0, lsl #2] // perform lookups
ldr r1, [r12, r1, lsl #2] //
ldr r2, [r12, r2, lsl #2] //
ldr r3, [r12, r3, lsl #2] //
eor r7, r7, r0, ror #8 // exclusive-OR with previous
eor r6, r6, r1, ror #8 //
eor r5, r5, r2, ror #8 //
eor r4, r4, r3, ror #8 //
ldrd r0, r1, [keyptr, #0] // fetch key into r0-r3
ldrd r2, r3, [keyptr, #8] //
adds keyptr, keyptr, #16 // increment key pointer
eors r0, r0, r4 // exclusive-OR the key and rotate into final
eor r1, r1, r5, ror #8 // position
eor r2, r2, r6, ror #16 //
eor r3, r3, r7, ror #24 //
MACRO_END()
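As a rough C model of what ENC_MIX computes for the first output word (helper names are ours; the other three words differ only by the final rotation applied at the key XOR):

```
#include <stdint.h>

static uint32_t ror32(uint32_t v, unsigned r)
{
    return (v >> r) | (v << ((32u - r) & 31u));
}

/* One column of the table-lookup round: a byte from each input word
 * indexes a 256-entry 32-bit table, each result is rotated into place
 * (the barrel shifter in the assembly), then everything is XORed
 * together along with the round key. */
static uint32_t enc_mix_word(const uint32_t T[256],
                             uint32_t s0, uint32_t s1, uint32_t s2, uint32_t s3,
                             uint32_t roundKey)
{
    return T[s0 & 0xff]
         ^ ror32(T[(s1 >>  8) & 0xff], 24)
         ^ ror32(T[(s2 >> 16) & 0xff], 16)
         ^ ror32(T[(s3 >> 24) & 0xff],  8)
         ^ roundKey;
}
```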
MACRO_START(DEC_MIX, keyptr)
//
// Perform the unkeyed mixing function for decryption
//
// Input:
// r0,r1,r2,r3 = current block
// keyptr = pointer to current key
// r12 = pointer to AesInvSboxMatrixMult
//
// Output:
// r0,r1,r2,r3 = updated block
// keyptr = updated to point to following key
// r12 = unmodified
//
// Used:
// r4,r5,r6,r7,lr are modified
//
// N.B. To make better use of ARM's barrel shifter, this code differs
// from the AMD64 approach. The first lookups are not rotated;
// instead all subsequent lookups are applied on top rotated, and
// then a final rotation is performed to shift the bits into the
// proper spot.
//
uxtb r4, r0 // extract individual bytes from r0
uxtb r5, r0, ror #8 //
uxtb r6, r0, ror #16 //
uxtb r7, r0, ror #24 //
ldr r4, [r12, r4, lsl #2] // perform lookups of each byte, leaving
ldr r5, [r12, r5, lsl #2] // the values unrotated for now
ldr r6, [r12, r6, lsl #2] //
ldr r7, [r12, r7, lsl #2] //
uxtb r0, r1 // extract individual bytes from r1
uxtb lr, r1, ror #8 // (with 1 more register we could do 4 at a time)
ldr r0, [r12, r0, lsl #2] // perform lookups
ldr lr, [r12, lr, lsl #2] //
eor r5, r5, r0, ror #8 // exclusive-OR with previous
eor r6, r6, lr, ror #8 //
uxtb r0, r1, ror #16 // extract remaining bytes from r1
uxtb r1, r1, ror #24 //
ldr r0, [r12, r0, lsl #2] // perform lookups
ldr r1, [r12, r1, lsl #2] //
eor r7, r7, r0, ror #8 // exclusive-OR with previous
eor r4, r4, r1, ror #8 //
uxtb r0, r2 // extract individual bytes from r2
uxtb r1, r2, ror #8 //
uxtb lr, r2, ror #16 //
uxtb r2, r2, ror #24 //
ldr r0, [r12, r0, lsl #2] // perform lookups
ldr r1, [r12, r1, lsl #2] //
ldr lr, [r12, lr, lsl #2] //
ldr r2, [r12, r2, lsl #2] //
eor r6, r6, r0, ror #16 // exclusive-OR with previous
eor r7, r7, r1, ror #16 //
eor r4, r4, lr, ror #16 //
eor r5, r5, r2, ror #16 //
uxtb r0, r3 // extract individual bytes from r3
uxtb r1, r3, ror #8 //
uxtb r2, r3, ror #16 //
uxtb r3, r3, ror #24 //
ldr r0, [r12, r0, lsl #2] // perform lookups
ldr r1, [r12, r1, lsl #2] //
ldr r2, [r12, r2, lsl #2] //
ldr r3, [r12, r3, lsl #2] //
eor r7, r7, r0, ror #24 // exclusive-OR with previous
eor r4, r4, r1, ror #24 //
eor r5, r5, r2, ror #24 //
eor r6, r6, r3, ror #24 //
ldrd r0, r1, [keyptr, #0] // fetch key into r0-r3
ldrd r2, r3, [keyptr, #8] //
adds keyptr, keyptr, #16 // increment key pointer
eors r0, r0, r4 // exclusive-OR the key and rotate into final
eor r1, r1, r5, ror #24 // position
eor r2, r2, r6, ror #16 //
eor r3, r3, r7, ror #8 //
MACRO_END()
MACRO_START(AES_ENCRYPT, loopLabel)
//
// Input:
// r0,r1,r2,r3 = plaintext
// r8 = pointer to first round key to use
// r9 = pointer to last key to use
// r12 = pointer to AesSboxMatrixMult
//
// Output:
// r4,r5,r6,r7 = ciphertext
// r8 = modified to point to last key
// r9 = unmodified
// r12 = unmodified
//
// Used:
// lr is also modified
//
//
// xor in first round key
//
ldrd r4, r5, [r8, #0] // fetch key in r4-r7
ldrd r6, r7, [r8, #8] //
eors r0, r0, r4 // exclusive-OR with the plaintext
eors r1, r1, r5 //
eors r2, r2, r6 //
eors r3, r3, r7 //
add r8, r8, #16 // point to second key
loopLabel:
//
// Block is r0,r1,r2,r3
// r8 points to current round key
//
ENC_MIX r8 // encrypt the block and increment key
cmp r8, r9 // are we at the end?
blo loopLabel // loop until it is so
//
// Now for the final round
// We use the fact that SboxMatrixMult[0] table is also
// an Sbox table if you use the second element of each entry.
//
// Result is in r4,r5,r6,r7
//
add r12, r12, #1 // advance by 1 to point to second element
uxtb r4, r0 // extract individual bytes from r0
uxtb r7, r0, ror #8 //
uxtb r6, r0, ror #16 //
uxtb r5, r0, ror #24 //
ldrb r4, [r12, r4, lsl #2] // perform lookups of each byte, leaving
ldrb r7, [r12, r7, lsl #2] // the values unrotated for now
ldrb r6, [r12, r6, lsl #2] //
ldrb r5, [r12, r5, lsl #2] //
uxtb r0, r1 // extract individual bytes from r1
uxtb lr, r1, ror #8 // (with 1 more register we could do 4 at a time)
ldrb r0, [r12, r0, lsl #2] // perform lookups
ldrb lr, [r12, lr, lsl #2] //
orr r5, r5, r0, lsl #8 // merge with previous
orr r4, r4, lr, lsl #8 //
uxtb r0, r1, ror #16 // extract remaining bytes from r1
uxtb r1, r1, ror #24 //
ldrb r0, [r12, r0, lsl #2] // perform lookups
ldrb r1, [r12, r1, lsl #2] //
orr r7, r7, r0, lsl #8 // merge with previous
orr r6, r6, r1, lsl #8 //
uxtb r0, r2 // extract individual bytes from r2
uxtb r1, r2, ror #8 //
uxtb lr, r2, ror #16 //
uxtb r2, r2, ror #24 //
ldrb r0, [r12, r0, lsl #2] // perform lookups
ldrb r1, [r12, r1, lsl #2] //
ldrb lr, [r12, lr, lsl #2] //
ldrb r2, [r12, r2, lsl #2] //
orr r6, r6, r0, lsl #16 // merge with previous
orr r5, r5, r1, lsl #16 //
orr r4, r4, lr, lsl #16 //
orr r7, r7, r2, lsl #16 //
uxtb r0, r3 // extract individual bytes from r3
uxtb r1, r3, ror #8 //
uxtb r2, r3, ror #16 //
uxtb r3, r3, ror #24 //
ldrb r0, [r12, r0, lsl #2] // perform lookups
ldrb r1, [r12, r1, lsl #2] //
ldrb r2, [r12, r2, lsl #2] //
ldrb r3, [r12, r3, lsl #2] //
orr r7, r7, r0, lsl #24 // merge with previous
orr r6, r6, r1, lsl #24 //
orr r5, r5, r2, lsl #24 //
orr r4, r4, r3, lsl #24 //
sub r12, r12, #1 // put r12 back to its original value
//
// xor in final round key
//
ldrd r0, r1, [r9, #0] // fetch key into r0-r3
ldrd r2, r3, [r9, #8] //
eors r4, r4, r0 // exclusive-OR the key and rotate into final
eor r5, r1, r5, ror #8 // position
eor r6, r2, r6, ror #16 //
eor r7, r3, r7, ror #24 //
MACRO_END()
MACRO_START(AES_DECRYPT, loopLabel)
//
// Input:
// r0,r1,r2,r3 = ciphertext
// r8 = pointer to first round key to use
// r9 = pointer to last key to use
// r10 = pointer to InvSbox
// r12 = pointer to InvSboxMatrixMult
//
// Output:
// r4,r5,r6,r7 = plaintext
// r8 = modified to point to last key
// r9 = unmodified
// r10 = unmodified
// r12 = unmodified
//
// Used:
// lr is also modified
//
//
// xor in first round key
//
ldrd r4, r5, [r8, #0] // fetch key in r4-r7
ldrd r6, r7, [r8, #8] //
eors r0, r0, r4 // exclusive-OR with the plaintext
eors r1, r1, r5 //
eors r2, r2, r6 //
eors r3, r3, r7 //
add r8, r8, #16 // point to second key
loopLabel:
//
// Block is r0, r1, r2, r3
// r8 points to current round key
//
DEC_MIX r8 // decrypt the block and increment key
cmp r8, r9 // are we at the end?
blo loopLabel // loop until it is so
//
// Now for the final round
// Result is in r4, r5, r6, r7
//
uxtb r4, r0 // extract individual bytes from r0
uxtb r5, r0, ror #8 //
uxtb r6, r0, ror #16 //
uxtb r7, r0, ror #24 //
ldrb r4, [r10, r4] // perform lookups of each byte, leaving
ldrb r5, [r10, r5] // the values unrotated for now
ldrb r6, [r10, r6] //
ldrb r7, [r10, r7] //
uxtb r0, r1 // extract individual bytes from r1
uxtb lr, r1, ror #8 // (with 1 more register we could do 4 at a time)
ldrb r0, [r10, r0] // perform lookups
ldrb lr, [r10, lr] //
orr r5, r5, r0, lsl #24 // merge with previous
orr r6, r6, lr, lsl #24 //
uxtb r0, r1, ror #16 // extract remaining bytes from r1
uxtb r1, r1, ror #24 //
ldrb r0, [r10, r0] // perform lookups
ldrb r1, [r10, r1] //
orr r7, r7, r0, lsl #24 // merge with previous
orr r4, r4, r1, lsl #24 //
uxtb r0, r2 // extract individual bytes from r2
uxtb r1, r2, ror #8 //
uxtb lr, r2, ror #16 //
uxtb r2, r2, ror #24 //
ldrb r0, [r10, r0] // perform lookups
ldrb r1, [r10, r1] //
ldrb lr, [r10, lr] //
ldrb r2, [r10, r2] //
orrs r6, r6, r0, lsl #16 // merge with previous
orr r7, r7, r1, lsl #16 //
orr r4, r4, lr, lsl #16 //
orr r5, r5, r2, lsl #16 //
uxtb r0, r3 // extract individual bytes from r3
uxtb r1, r3, ror #8 //
uxtb r2, r3, ror #16 //
uxtb r3, r3, ror #24 //
ldrb r0, [r10, r0] // perform lookups
ldrb r1, [r10, r1] //
ldrb r2, [r10, r2] //
ldrb r3, [r10, r3] //
orr r7, r7, r0, lsl #8 // merge with previous
orr r4, r4, r1, lsl #8 //
orr r5, r5, r2, lsl #8 //
orr r6, r6, r3, lsl #8 //
//
// xor in final round key
//
ldrd r0, r1, [r9, #0] // fetch key into r0-r3
ldrd r2, r3, [r9, #8] //
eors r4, r4, r0 // exclusive-OR the key and rotate into final
eor r5, r1, r5, ror #24 // position
eor r6, r2, r6, ror #16 //
eor r7, r3, r7, ror #8 //
MACRO_END()
//
// VOID
// SYMCRYPT_CALL
// SymCryptAesEncrypt( _In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
// _In_reads_bytes_( SYMCRYPT_AES_BLOCK_LEN ) PCBYTE pbPlaintext,
// _Out_writes_bytes_( SYMCRYPT_AES_BLOCK_LEN ) PBYTE pbCiphertext )
//
// NESTED_ENTRY SymCryptAesEncryptAsm
FUNCTION_START(SymCryptAesEncryptAsm, 3, 12)
//
// Input parameters:
// r0 = pExpandedKey
// r1 = pbPlaintext
// r2 = pbCiphertext
//
// push {r2, r4-r11, lr}
push {r2}
//
// Stack layout:
// [sp] = r2 = pbCipherText
//
SYMCRYPT_CHECK_MAGIC r4, r5, r0, SYMCRYPT_AES_EXPANDED_KEY_magic, SymCryptAesEncryptAsm_check_magic_label
ldr r9, [r0, #SYMCRYPT_AES_EXPANDED_KEY_lastEncRoundKey] // r9 = last key
mov r8, r0 // r8 = first key
// mov32 r12, SymCryptAesSboxMatrixMult // r12 = matrix mult table
ldr r12, =SymCryptAesSboxMatrixMult // r12 = matrix mult table
ldr r0, [r1, #0] // load the plaintext
ldr r2, [r1, #8] //
ldr r3, [r1, #12] //
ldr r1, [r1, #4] //
AES_ENCRYPT SymCryptAesEncryptAsm_loopLabel
//
// Plaintext in r0, r1, r2, r3
// r8 points to first round key to use
// r9 is last key to use (unchanged)
// r12 points to SboxMatrixMult (unchanged)
// Ciphertext ends up in r4, r5, r6, r7
//
ldr r0, [sp] // recover pbCipherText
str r4, [r0, #0] // store the encrypted data
str r5, [r0, #4] //
str r6, [r0, #8] //
str r7, [r0, #12] //
// pop {r2, r4-r11, pc} // return
pop {r2}
// NESTED_END SymCryptAesEncryptAsm
FUNCTION_END(SymCryptAesEncryptAsm)
//
// VOID
// SYMCRYPT_CALL
// SymCryptAesDecrypt( _In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
// _In_reads_bytes_( SYMCRYPT_AES_BLOCK_LEN ) PCBYTE pbCiphertext,
// _Out_writes_bytes_( SYMCRYPT_AES_BLOCK_LEN ) PBYTE pbPlaintext )
//
// NESTED_ENTRY SymCryptAesDecryptAsm
FUNCTION_START(SymCryptAesDecryptAsm, 3, 12)
//
// Input parameters:
// r0 = pExpandedKey
// r1 = pbCiphertext
// r2 = pbPlaintext
//
// push {r2, r4-r11, lr}
push {r2}
//
// Stack layout:
// [sp] = r2 = pbPlaintext
//
SYMCRYPT_CHECK_MAGIC r4, r5, r0, SYMCRYPT_AES_EXPANDED_KEY_magic, SymCryptAesDecryptAsm_check_magic_label
ldr r8, [r0, #SYMCRYPT_AES_EXPANDED_KEY_lastEncRoundKey] // r8 = first key
ldr r9, [r0, #SYMCRYPT_AES_EXPANDED_KEY_lastDecRoundKey] // r9 = last key
// mov32 r10, SymCryptAesInvSbox // r10 = inverse sbox table
// mov32 r12, SymCryptAesInvSboxMatrixMult // r11 = inverse matrix mult table
ldr r10, =SymCryptAesInvSbox // r10 = inverse sbox table
ldr r12, =SymCryptAesInvSboxMatrixMult // r11 = inverse matrix mult table
ldr r0, [r1, #0] // load the ciphertext
ldr r2, [r1, #8] //
ldr r3, [r1, #12] //
ldr r1, [r1, #4] //
AES_DECRYPT SymCryptAesDecryptAsm_loopLabel
//
// Ciphertext in r0, r1, r2, r3
// r8 points to first round key to use
// r9 is last key to use (unchanged)
// r10 points to InvSbox (unchanged)
// r12 points to InvSboxMatrixMult (unchanged)
// Plaintext ends up in r4, r5, r6, r7
//
ldr r0, [sp] // recover pbPlaintext
str r4, [r0, #0] // store the decrypted data
str r5, [r0, #4] //
str r6, [r0, #8] //
str r7, [r0, #12] //
pop {r2}
// pop {r2, r4-r11, pc} // return
// NESTED_END SymCryptAesDecryptAsm
FUNCTION_END(SymCryptAesDecryptAsm)
//
// VOID
// SYMCRYPT_CALL
// SymCryptAesCbcEncrypt(
// _In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
// _In_reads_bytes_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbChainingValue,
// _In_reads_bytes_( cbData ) PCBYTE pbSrc,
// _Out_writes_bytes_( cbData ) PBYTE pbDst,
// SIZE_T cbData )
// NESTED_ENTRY SymCryptAesCbcEncryptAsm
FUNCTION_START(SymCryptAesCbcEncryptAsm, 4, 12)
//
// Input parameters:
// r0 = pExpandedKey
// r1 = pbChainingValue
// r2 = pbSrc
// r3 = pbDst
// [sp] = cbData
//
// push {r0-r2, r4-r11, lr}
push {r0-r2}
sub sp, sp, #16
//
// Stack layout:
// [sp] = pbSrc
// [sp+4] = pbSrcEnd
// [sp+16] = r0 = pbExpandedKey
// [sp+20] = r1 = pbChainingValue
// [sp+24] = r2 = pbSrc
// [sp+64] = cbData
//
SYMCRYPT_CHECK_MAGIC r4, r5, r0, SYMCRYPT_AES_EXPANDED_KEY_magic, SymCryptAesCbcEncryptAsm_check_magic_label
pld [r2] // prefetch source data
ldr r4, [sp, #64] // r4 = cbData
mov r8, r2 // r8 = pbSrc on loop entry
ldr r9, [r0, #SYMCRYPT_AES_EXPANDED_KEY_lastEncRoundKey] // r9 = last enc round key (invariant)
mov r10, r3 // r10 = pbDst
// mov32 r12, SymCryptAesSboxMatrixMult // r12 = pointer to lookup table (invariant)
ldr r12, =SymCryptAesSboxMatrixMult // r12 = pointer to lookup table (invariant)
bics r4, r4, #15 // r4 &= ~15
beq SymCryptAesCbcEncryptNoData // skip if no data
adds r4, r4, r2 // r4 = pbSrc + cbData
// strd r2, r4, [sp] // save pbSrc/pbSrcEnd at [sp]
str r2, [sp]
str r4, [sp, #4]
pld [r8, #32] // prefetch source data
ldr r4, [r1, #0] // load chaining state from pbChainingValue
ldr r5, [r1, #4] //
ldr r6, [r1, #8] //
ldr r7, [r1, #12] //
SymCryptAesCbcEncryptAsmLoop:
//
// Loop register setup
// r4,r5,r6,r7 = chaining state
// r8 = pbSrc
// r9 = last round key to use
// r10 = pbDst
// r12 = SboxMatrixMult
//
ldr r0, [r8] // read next 16 bytes of plaintext
ldr r1, [r8, #4] //
ldr r2, [r8, #8] //
ldr r3, [r8, #12] //
pld [r8, #64] // prefetch source data
add r8, r8, #16 // pbSrc += 16
str r8, [sp] // save it
eors r0, r0, r4 // exclusive-OR against chaining value
eors r1, r1, r5 //
eors r2, r2, r6 //
eors r3, r3, r7 //
ldr r8, [sp, #16] // r8 = first round key
AES_ENCRYPT SymCryptAesCbcEncryptAsm_loopLabel
//
// Plaintext in r0, r1, r2, r3
// r8 points to first round key to use
// r9 is last key to use (unchanged)
// r12 points to SboxMatrixMult (unchanged)
// Ciphertext ends up in r4, r5, r6, r7
//
// ldrd r8, r0, [sp] // fetch pbSrc/pbSrcEnd
ldr r8, [sp]
ldr r0, [sp, #4]
str r4, [r10, #0] // write ciphertext
str r5, [r10, #4] //
str r6, [r10, #8] //
str r7, [r10, #12] //
add r10, r10, #16 // pbDst += 16
cmp r8, r0 // are we at the end of source?
blo SymCryptAesCbcEncryptAsmLoop // loop until we are
ldr r0, [sp, #20] // r0 = pbChainingValue
str r4, [r0, #0] // update the chaining value
str r5, [r0, #4] //
str r6, [r0, #8] //
str r7, [r0, #12] //
SymCryptAesCbcEncryptNoData:
add sp, sp, #16
pop {r0-r2}
// pop {r0-r2, r4-r11, pc} // return
// NESTED_END SymCryptAesCbcEncryptAsm
FUNCTION_END(SymCryptAesCbcEncryptAsm)
//
// VOID
// SYMCRYPT_CALL
// SymCryptAesCbcDecrypt(
// _In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
// _In_reads_bytes_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbChainingValue,
// _In_reads_bytes_( cbData ) PCBYTE pbSrc,
// _Out_writes_bytes_( cbData ) PBYTE pbDst,
// SIZE_T cbData )
// NESTED_ENTRY SymCryptAesCbcDecryptAsm
FUNCTION_START(SymCryptAesCbcDecryptAsm, 4, 12)
//
// Input parameters:
// r0 = pExpandedKey
// r1 = pbChainingValue
// r2 = pbSrc
// r3 = pbDst
// [sp] = cbData
//
// push {r0-r2, r4-r11, lr}
push {r0-r2}
sub sp, sp, #32
//
// Stack layout:
// [sp] = pbSrc
// [sp+4] = pbDst
// [sp+8] = pbSrcEnd
// [sp+16] = saved chaining value
// [sp+32] = r0 = pbExpandedKey
// [sp+36] = r1 = pbChainingValue
// [sp+40] = r2 = pbSrc
// [sp+80] = cbData
//
SYMCRYPT_CHECK_MAGIC r4, r5, r0, SYMCRYPT_AES_EXPANDED_KEY_magic, SymCryptAesCbcDecryptAsm_check_magic_label
ldr r4, [sp, #80] // r4 = cbData
bics r4, r4, #15 // r4 &= ~15
beq SymCryptAesCbcDecryptNoData // skip if no data
ldr r9, [r0, #SYMCRYPT_AES_EXPANDED_KEY_lastDecRoundKey] // r9 = last dec round key (invariant)
ldr r8, [r0, #SYMCRYPT_AES_EXPANDED_KEY_lastEncRoundKey]
subs r4, r4, #16
adds r3, r3, r4
adds r2, r2, r4
pld [r2] // prefetch source data
str r3, [sp, #4]
str r2, [sp, #0]
str r8, [sp, #8]
// mov32 r10, SymCryptAesInvSbox
// mov32 r12, SymCryptAesInvSboxMatrixMult
ldr r10, =SymCryptAesInvSbox
ldr r12, =SymCryptAesInvSboxMatrixMult
//
// Load last ciphertext block & save on stack (we need to put it in the pbChaining buffer later)
//
pld [r2, #-32] // prefetch source data
ldr r0, [r2, #0]
ldr r1, [r2, #4]
ldr r3, [r2, #12]
ldr r2, [r2, #8]
strd r0, r1, [sp, #16]
strd r2, r3, [sp, #24]
b SymCryptAesCbcDecryptAsmLoopEntry
SymCryptAesCbcDecryptAsmLoop:
// Loop register setup
// r8 = pbSrc (current block)
// lr = pbDst (current block)
// r4,r5,r6,r7 = decrypted block from AES_DECRYPT
ldr r0, [r8, #-16]
ldr r1, [r8, #-12]
ldr r2, [r8, #-8]
ldr r3, [r8, #-4]
pld [r8, #-64] // prefetch source data
eors r4, r4, r0
eors r5, r5, r1
eors r6, r6, r2
eors r7, r7, r3
str r4, [lr, #0]
str r5, [lr, #4]
str r6, [lr, #8]
str r7, [lr, #12]
sub lr, lr, #16
sub r8, r8, #16
str r8, [sp]
str lr, [sp, #4]
SymCryptAesCbcDecryptAsmLoopEntry:
ldr r8, [sp, #8]
AES_DECRYPT SymCryptAesCbcDecryptAsm_loopLabel
// ldrd r8, lr, [sp, #0]
ldr r8, [sp, #0]
ldr lr, [sp, #4]
ldr r0, [sp, #40]
cmp r8, r0
bhi SymCryptAesCbcDecryptAsmLoop
ldr r8, [sp, #36] // r8 = pbChainingValue
ldr r0, [r8, #0]
ldr r1, [r8, #4]
ldr r2, [r8, #8]
ldr r3, [r8, #12]
eors r4, r4, r0
eors r5, r5, r1
eors r6, r6, r2
eors r7, r7, r3
str r4, [lr, #0]
str r5, [lr, #4]
str r6, [lr, #8]
str r7, [lr, #12]
//
// Update the chaining value to the last ciphertext block
//
ldrd r0, r1, [sp, #16]
ldrd r2, r3, [sp, #24]
str r0, [r8, #0]
str r1, [r8, #4]
str r2, [r8, #8]
str r3, [r8, #12]
SymCryptAesCbcDecryptNoData:
add sp, sp, #32
pop {r0-r2}
// pop {r0-r2, r4-r11, pc}
// NESTED_END SymCryptAesCbcDecryptAsm
FUNCTION_END(SymCryptAesCbcDecryptAsm)
//
// VOID
// SYMCRYPT_CALL
// SymCryptAesCtrMsb64(
// _In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
// _In_reads_bytes_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbChainingValue,
// _In_reads_bytes_( cbData ) PCBYTE pbSrc,
// _Out_writes_bytes_( cbData ) PBYTE pbDst,
// SIZE_T cbData )
// NESTED_ENTRY SymCryptAesCtrMsb64Asm
FUNCTION_START(SymCryptAesCtrMsb64Asm, 4, 12)
//
// Input parameters:
// r0 = pExpandedKey
// r1 = pbChainingValue
// r2 = pbSrc
// r3 = pbDst
// [sp] = cbData
//
// push {r0-r2, r4-r11, lr}
push {r0-r2}
sub sp, sp, #32
//
// Stack layout:
// [sp] = pbDst
// [sp+4] = pbSrcEnd
// [sp+16] = local copy of chaining data
// [sp+32] = r0 = pbExpandedKey
// [sp+36] = r1 = pbChainingValue
// [sp+40] = r2 = pbSrc
// [sp+80] = cbData
//
SYMCRYPT_CHECK_MAGIC r4, r5, r0, SYMCRYPT_AES_EXPANDED_KEY_magic, SymCryptAesCtrMsb64Asm_check_magic_label
pld [r2] // prefetch source data
ldr r4, [sp, #80] // r4 = cbData
ldr r9, [r0, #SYMCRYPT_AES_EXPANDED_KEY_lastEncRoundKey] // r9 = last enc round key (invariant)
mov r10, r2 // r10 = pbSrc
// mov32 r12, SymCryptAesSboxMatrixMult // r12 = pointer to lookup table (invariant)
ldr r12, =SymCryptAesSboxMatrixMult // r12 = pointer to lookup table (invariant)
bics r4, r4, #15 // r4 &= ~15
beq SymCryptAesCtrMsb64NoData // skip if no data
adds r4, r4, r2 // r4 = pbSrc + cbData
// strd r3, r4, [sp] // save pbDst/pbSrcEnd at [sp]
str r3, [sp] // save pbDst at [sp]
str r4, [sp, #4] // save pbSrcEnd at [sp, #4]
pld [r10, #32] // prefetch source data
mov r3, r1 // load chaining state from pbChainingValue
ldr r0, [r3, #0] //
ldr r1, [r3, #4] //
ldr r2, [r3, #8] //
ldr r3, [r3, #12] //
strd r0, r1, [sp, #16] // save a local copy
strd r2, r3, [sp, #24] //
SymCryptAesCtrMsb64AsmLoop:
//
// Loop register setup
// r0,r1,r2,r3 = chaining state
// r8 = first round key (loaded below)
// r9 = last round key to use
// r10 = pbSrc
// r12 = SboxMatrixMult
//
ldr r8, [sp, #32] // r8 = first round key
AES_ENCRYPT SymCryptAesCtrMsb64Asm_loopLabel
//
// Plaintext in r0, r1, r2, r3
// r8 points to first round key to use
// r9 is last key to use (unchanged)
// r12 points to SboxMatrixMult (unchanged)
// Ciphertext ends up in r4, r5, r6, r7
//
ldr r0, [r10, #0] // load plaintext
ldr r1, [r10, #4] //
ldr r2, [r10, #8] //
ldr r3, [r10, #12] //
pld [r10, #64] // prefetch source data
// ldrd r8, lr, [sp] // fetch pbDst/pbSrcEnd
ldr r8, [sp]
ldr lr, [sp, #4]
eors r0, r0, r4 // exclusive-OR against encrypt results
eors r1, r1, r5 //
eors r2, r2, r6 //
eors r3, r3, r7 //
str r0, [r8, #0] // store to destination
str r1, [r8, #4] //
str r2, [r8, #8] //
str r3, [r8, #12] //
ldrd r0, r1, [sp, #16] // load chaining state
ldrd r2, r3, [sp, #24] //
add r8, r8, #16 // pbDst += 16
add r10, r10, #16 // pbSrc += 16
str r8, [sp] // save pbDst
rev r3, r3 // reverse the second qword
rev r2, r2 //
adds r3, r3, #1 // increment the counter
adcs r2, r2, #0 //
rev r3, r3 // re-reverse the second word
rev r2, r2 //
strd r2, r3, [sp, #24] // write updated state
cmp r10, lr // done?
blo SymCryptAesCtrMsb64AsmLoop // loop until finished
ldr r0, [sp, #36] // get pbChainingValue
movs r1, #0 // get 0 in r1
str r2, [r0, #8] // write back modified part of chaining state
str r3, [r0, #12] //
// strd r1, r1, [sp, #16] // wipe the stack copy
// strd r1, r1, [sp, #24] //
str r1, [sp, #16]
str r1, [sp, #20]
str r1, [sp, #24]
str r1, [sp, #28]
SymCryptAesCtrMsb64NoData:
add sp, sp, #32
pop {r0-r2}
// pop {r0-r2, r4-r11, pc} // return
// NESTED_END SymCryptAesCtrMsb64Asm
FUNCTION_END(SymCryptAesCtrMsb64Asm)
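The REV/ADDS/ADCS sequence in the loop above is a big-endian 64-bit counter increment; a hedged C equivalent (helper names are ours):

```
#include <stdint.h>

static uint32_t bswap32(uint32_t v)   /* models the REV instruction */
{
    return (v << 24) | ((v & 0xff00u) << 8) | ((v >> 8) & 0xff00u) | (v >> 24);
}

/* The last 8 bytes of the chaining value hold a big-endian counter:
 * byte-reverse both words, increment as a 64-bit value, reverse back. */
static void ctrMsb64Increment(uint32_t chain[4])
{
    uint32_t lo = bswap32(chain[3]);   /* least-significant word */
    uint32_t hi = bswap32(chain[2]);   /* most-significant word */
    if (++lo == 0)
        ++hi;                          /* propagate the carry */
    chain[3] = bswap32(lo);
    chain[2] = bswap32(hi);
}
```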

View file

@@ -0,0 +1,790 @@
//
// fdef_asm.symcryptasm Assembler code for large integer arithmetic in the default data format for the ARM architecture
//
// Copyright (c) Microsoft Corporation. Licensed under the MIT license.
//
// #include "ksarm.h"
#include "C_asm_shared.inc"
// A digit consists of 4 words of 32 bits each
//UINT32
//SYMCRYPT_CALL
// SymCryptFdefRawAdd(
// _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc1,
// _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 pSrc2,
// _Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 pDst,
// UINT32 nDigits )
//
// Initial inputs to registers:
// pSrc1 -> r0
// pSrc2 -> r1
// pDst -> r2
// nDigits -> r3
// LEAF_ENTRY SymCryptFdefRawAddAsm
FUNCTION_START(SymCryptFdefRawAddAsm, 4, 10)
// push {r4-r9, lr}
neg r3, r3 // negate the digit count
mov r8, #0 // carry = r8 = 0
mov r9, #0 // r9 = 0
SymCryptFdefRawAddAsmLoop:
rrxs r8, r8 // set the carry flag if bit[0] of r8 is set
ldmia r0!, {r4, r6} // Load two words of pSrc1
ldmia r1!, {r5, r7} // Load two words of pSrc2
adcs r4, r4, r5
adcs r6, r6, r7
stmia r2!, {r4, r6} // Store the result in the destination
ldmia r0!, {r4, r6} // Load two words of pSrc1
ldmia r1!, {r5, r7} // Load two words of pSrc2
adcs r4, r4, r5
adcs r6, r6, r7
stmia r2!, {r4, r6} // Store the result in the destination
adc r8, r9, r9 // r8 = 1 if the carry flag is set
adds r3, r3, #1 // Increment the digit count by one
bne SymCryptFdefRawAddAsmLoop
mov r0, r8 // Set the return value equal to the carry
// pop {r4-r9, pc}
// LEAF_END SymCryptFdefRawAddAsm
FUNCTION_END(SymCryptFdefRawAddAsm)
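Functionally this is plain multi-word addition; the only subtlety is that the loop-counter ADDS clobbers the flags, so the carry is parked in r8 as 0/1 and re-materialized by RRXS each iteration. A C reference model:

```
#include <stdint.h>

/* Reference model for SymCryptFdefRawAddAsm: add two nWords-long numbers,
 * least-significant word first; the return value is the final carry,
 * exactly what r8 holds on exit. */
static uint32_t rawAdd(const uint32_t *src1, const uint32_t *src2,
                       uint32_t *dst, int nWords)
{
    uint32_t carry = 0;
    for (int i = 0; i < nWords; i++) {
        uint64_t s = (uint64_t)src1[i] + src2[i] + carry;
        dst[i] = (uint32_t)s;
        carry = (uint32_t)(s >> 32);
    }
    return carry;
}
```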
//UINT32
//SYMCRYPT_CALL
//SymCryptFdefRawSub(
// _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 Src1,
// _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCUINT32 Src2,
// _Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32 Dst,
// UINT32 nDigits )
//
// Initial inputs to registers:
// pSrc1 -> r0
// pSrc2 -> r1
// pDst -> r2
// nDigits -> r3
// LEAF_ENTRY SymCryptFdefRawSubAsm
FUNCTION_START(SymCryptFdefRawSubAsm, 4, 10)
// push {r4-r9, lr}
neg r3, r3 // negate the digit count
mov r8, #0 // borrow = r8 = 0
mov r9, #0 // r9 = 0
SymCryptFdefRawSubAsmLoop:
subs r8, r9, r8 // if r8>0 then the "borrow flag" is set
ldmia r0!, {r4, r6} // Load two words of pSrc1
ldmia r1!, {r5, r7} // Load two words of pSrc2
sbcs r4, r4, r5
sbcs r6, r6, r7
stmia r2!, {r4, r6} // Store the result in the destination
ldmia r0!, {r4, r6} // Load two words of pSrc1
ldmia r1!, {r5, r7} // Load two words of pSrc2
sbcs r4, r4, r5
sbcs r6, r6, r7
stmia r2!, {r4, r6} // Store the result in the destination
sbc r8, r9, r9 // If borrow=1, then r8 = -1 = 0xffffffff
adds r3, r3, #1 // Increment the digit count by one
bne SymCryptFdefRawSubAsmLoop
and r0, r8, #1 // If r8>0, set the return value to 1
// pop {r4-r9, pc}
// LEAF_END SymCryptFdefRawSubAsm
FUNCTION_END(SymCryptFdefRawSubAsm)
//VOID
//SYMCRYPT_CALL
//SymCryptFdefMaskedCopy(
// _In_reads_bytes_( nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PCBYTE pbSrc,
// _InOut_writes_bytes_( nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PBYTE pbDst,
// UINT32 nDigits,
// UINT32 mask )
// LEAF_ENTRY SymCryptFdefMaskedCopyAsm
FUNCTION_START(SymCryptFdefMaskedCopyAsm, 4, 10)
// push {r4-r9, lr}
neg r2, r2 // negate the digit count
mov r9, #0 // r9 = 0
subs r4, r9, r3 // If (r3 > 0) clear the carry flag (i.e. borrow)
sbc r3, r9, r9 // r3 = mask = 0xffffffff if the carry flag is clear
// orn r9, r9, r3 // r9 = NOT(MASK) = 0 if r3 = 0xffffffff
orr r9, r9, r3
mvn r9, r9
mov r8, r1 // save the destination pointer
SymCryptFdefMaskedCopyAsmLoop:
ldmia r0!, {r4, r6} // Load two words of the source
ldmia r1!, {r5, r7} // Load two words of the destination
and r4, r4, r3
and r5, r5, r9
orr r4, r4, r5
and r6, r6, r3
and r7, r7, r9
orr r6, r6, r7
stmia r8!, {r4, r6} // Store the two words in the destination
ldmia r0!, {r4, r6} // Load two words of the source
ldmia r1!, {r5, r7} // Load two words of the destination
and r4, r4, r3
and r5, r5, r9
orr r4, r4, r5
and r6, r6, r3
and r7, r7, r9
orr r6, r6, r7
stmia r8!, {r4, r6} // Store the two words in the destination
adds r2, r2, #1 // Increment the digit count by one
bne SymCryptFdefMaskedCopyAsmLoop
// Done, no return value
// pop {r4-r9, pc}
// LEAF_END SymCryptFdefMaskedCopyAsm
FUNCTION_END(SymCryptFdefMaskedCopyAsm)
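The loop above is a branch-free constant-time select; a C sketch of the core, after the prologue has normalized any nonzero mask to all ones:

```
#include <stdint.h>

/* dst[i] = src[i] when mask == 0xffffffff; dst[i] is kept when mask == 0.
 * No branch or memory access depends on the (possibly secret) mask. */
static void maskedCopy(const uint32_t *src, uint32_t *dst,
                       int nWords, uint32_t mask)
{
    for (int i = 0; i < nWords; i++)
        dst[i] = (src[i] & mask) | (dst[i] & ~mask);
}
```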
//VOID
//SYMCRYPT_CALL
//SymCryptFdefRawMul(
// _In_reads_(nWords1) PCUINT32 pSrc1,
// UINT32 nDigits1,
// _In_reads_(nWords2) PCUINT32 pSrc2,
// UINT32 nDigits2,
// _Out_writes_(nWords1 + nWords2) PUINT32 pDst )
//
// Initial inputs to registers:
// pSrc1 -> r0
// nDigits1 -> r1
// pSrc2 -> r2
// nDigits2 -> r3
// pDst -> In the stack
//
// Basic structure:
// for each 2 words in Src1:
// Dst += Src2 * (2 words of Src1)
//
// Register assignments
// r0 = pSrc1 (moving forward one word every outer loop)
// r1 = negated word count of pSrc1
// r2 = pSrc2 (moving forward one *digit* every inner loop)
// r3 = negated digit count of pSrc2 and pDst
// r4 = pDst (moving forward one *digit* every inner loop)
// r5 = Stored pDst (moving forward one word every outer loop)
// r6, r7 = Current words loaded from pSrc1
// r8, r9 = Current words loaded from pSrc2
// <r12:r11:r10> = "96-bit" sliding register to hold the result of multiplies
//
// Stack assignments
#define pSrc2 0 // Stored pSrc2 in stack
#define nDigits2 4 // Stored negated digit count of pSrc2 in stack
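The multiply loops below lean entirely on UMAAL; as a C model of that instruction (the helper is ours, not a compiler intrinsic):

```
#include <stdint.h>

/* UMAAL: <hi:lo> = a * b + lo + hi. The sum always fits in 64 bits,
 * since (2^32-1)^2 + 2*(2^32-1) = 2^64 - 1, so no carry can be lost. */
static void umaal(uint32_t *lo, uint32_t *hi, uint32_t a, uint32_t b)
{
    uint64_t r = (uint64_t)a * b + *lo + *hi;
    *lo = (uint32_t)r;
    *hi = (uint32_t)(r >> 32);
}
```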
// LEAF_ENTRY SymCryptFdefRawMulAsm
FUNCTION_START(SymCryptFdefRawMulAsm, 4, 13)
// push {r4-r12, lr}
sub sp, sp, #8
lsl r1, r1, #2 // Calculate word count
ldr r4, [sp, #(8+4*10)] // load pDst
neg r1, r1 // negate nWords1
neg r3, r3 // negate nDigits2
mov r5, r4 // store pDst
str r2, [sp, #pSrc2] // store pSrc2
str r3, [sp, #nDigits2] // store -nDigits2 for later
//
// First iteration of main loop (no adding of previous values from pDst)
//
mov r11, #0 // Setting r11 = 0
mov r12, #0 // and r12 = 0
ldmia r0!, {r6, r7} // Load two words from pSrc1
SymCryptFdefRawMulAsmLoopInner1:
adds r3, r3, #1 // move one digit up
ldmia r2!, {r8, r9} // Load two words from pSrc2
mov r10, #0 // Setting r10 = 0
umaal r10, r11, r6, r8 // <r11:r10> = r6 * r8 + r10 + r11
str r10, [r4], #4 // Store to destination
umaal r11, r12, r7, r8 // <r12:r11> = r7 * r8 + r11
mov r10, #0 // Setting r10 = 0
umaal r10, r11, r6, r9 // <r11:r10> = r6 * r9 + r10 + r11
str r10, [r4], #4 // Store to destination
umaal r11, r12, r7, r9 // <r12:r11> = r7 * r9 + r11
ldmia r2!, {r8, r9} // Load two words from pSrc2
mov r10, #0 // Setting r10 = 0
umaal r10, r11, r6, r8 // <r11:r10> = r6 * r8 + r10 + r11
str r10, [r4], #4 // Store to destination
umaal r11, r12, r7, r8 // <r12:r11> = r7 * r8 + r11
mov r10, #0 // Setting r10 = 0
umaal r10, r11, r6, r9 // <r11:r10> = r6 * r9 + r10 + r11
str r10, [r4], #4 // Store to destination
umaal r11, r12, r7, r9 // <r12:r11> = r7 * r9 + r11
bne SymCryptFdefRawMulAsmLoopInner1
stmia r4, {r11, r12} // Store the top two words in the destination
add r1, r1, #2 // move two words up
add r5, r5, #8 // move start of pDst two words up
//
// MAIN LOOP
//
SymCryptFdefRawMulAsmLoopOuter:
ldr r3, [sp, #nDigits2] // set -nDigits2
ldr r2, [sp, #pSrc2] // set pSrc2
mov r4, r5 // set pDst
mov r11, #0 // Setting r11 = 0
mov r12, #0 // and r12 = 0
ldmia r0!, {r6, r7} // Load two words from pSrc1
SymCryptFdefRawMulAsmLoopInner:
adds r3, r3, #1 // move one digit up
ldmia r2!, {r8, r9} // Load two words from pSrc2
ldr r10, [r4] // load 1 word from pDst
umaal r10, r11, r6, r8 // <r11:r10> = r6 * r8 + r10 + r11
str r10, [r4], #4 // Store to destination
umaal r11, r12, r7, r8 // <r12:r11> = r7 * r8 + r11
ldr r10, [r4] // load 1 word from pDst
umaal r10, r11, r6, r9 // <r11:r10> = r6 * r9 + r10 + r11
str r10, [r4], #4 // Store to destination
umaal r11, r12, r7, r9 // <r12:r11> = r7 * r9 + r11
ldmia r2!, {r8, r9} // Load two words from pSrc2
ldr r10, [r4] // load 1 word from pDst
umaal r10, r11, r6, r8 // <r11:r10> = r6 * r8 + r10 + r11
str r10, [r4], #4 // Store to destination
umaal r11, r12, r7, r8 // <r12:r11> = r7 * r8 + r11
ldr r10, [r4] // load 1 word from pDst
umaal r10, r11, r6, r9 // <r11:r10> = r6 * r9 + r10 + r11
str r10, [r4], #4 // Store to destination
umaal r11, r12, r7, r9 // <r12:r11> = r7 * r9 + r11
bne SymCryptFdefRawMulAsmLoopInner
adds r1, r1, #2 // move two words up
add r5, r5, #8 // move start of pDst two words up
stmia r4, {r11, r12} // Store the top two words in the destination
bne SymCryptFdefRawMulAsmLoopOuter
// Done, no return value
add sp, sp, #8
// pop {r4-r12, pc}
// LEAF_END SymCryptFdefRawMulAsm
FUNCTION_END(SymCryptFdefRawMulAsm)
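Stripped of the two-words-per-pass scheduling, the routine computes an ordinary schoolbook product; a C reference (using word counts rather than digit counts for brevity):

```
#include <stdint.h>

/* dst[0 .. n1+n2-1] = src1[0 .. n1-1] * src2[0 .. n2-1], 32-bit words,
 * least-significant first, matching the assembly's result layout. */
static void rawMul(const uint32_t *src1, int n1,
                   const uint32_t *src2, int n2, uint32_t *dst)
{
    for (int i = 0; i < n1 + n2; i++)
        dst[i] = 0;
    for (int i = 0; i < n1; i++) {
        uint32_t carry = 0;
        for (int j = 0; j < n2; j++) {
            uint64_t t = (uint64_t)src1[i] * src2[j] + dst[i + j] + carry;
            dst[i + j] = (uint32_t)t;
            carry = (uint32_t)(t >> 32);
        }
        dst[i + n2] = carry;   /* top word of this partial product */
    }
}
```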
// Macro for the first loop of the first pass of RawSquareAsm.
// It takes one word from the source, multiplies it with the mulword,
// adds the high level word of the previous macro call, and stores it into
// the destination.
//
// No word is taken from the destination; thus r10 is always set to 0.
//
// No carry flag is propagated from the previous macro call as the maximum is
// (2^32-1)^2 + 2^32-1 = 2^64 - 2^32
MACRO_START(SQR_SINGLEADD_32, index)
mov r10, #0
ldr r8, [r2, #4*index] // pSrc[i+j]
umaal r10, r11, r6, r8 // <r11:r10> = r6 * r8 + r10 + r11
str r10, [r4, #4*index] // Store to destination
MACRO_END()
// Macro for the remaining loops of the first pass of RawSquareAsm.
// The only difference to the above is that it also adds the word loaded
// from the destination buffer.
//
// No carry flag is propagated from the previous macro call as the maximum is
// (2^32-1)^2 + 2(2^32-1) = 2^64 - 1
MACRO_START(SQR_DOUBLEADD_32, index)
ldr r8, [r2, #4*index] // pSrc[i+j]
ldr r10, [r4, #4*index] // pDst[2*(i+j)]
umaal r10, r11, r6, r8 // <r11:r10> = r6 * r8 + r10 + r11
str r10, [r4, #4*index] // Store to destination
MACRO_END()
// Macro for the third pass loop of RawSquareAsm.
// It takes one mulword from the source, squares it, and
// adds it to the even columns of the destination. The carries are propagated
// to the odd columns.
//
// Here we can have a (1-bit) carry to the next call because the maximum value for
// a pair of columns is (2^32-1)^2+(2^64-1)+1 = 2^65 - 2^33 + 1 < 2^65 - 1
MACRO_START(SQR_DIAGONAL_PROP, index)
ldr r6, [r0, #4*index] // mulword
umull r10, r11, r6, r6
ldr r8, [r4, #8*index] // Load
ldr r9, [r4, #8*index + 4] // Load
// Adding the square to the even column
adcs r10, r10, r8 // carry from previous and update the flags
// Propagating the sum to the next column
adcs r11, r11, r9 // This can generate a carry
str r10, [r4, #8*index] // Store
str r11, [r4, #8*index + 4]// Store
MACRO_END()
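The carry bounds quoted in the three macro comments above (SQR_SINGLEADD_32, SQR_DOUBLEADD_32, SQR_DIAGONAL_PROP, in that order) check out; with $w = 2^{32}$:

$$(w-1)^2 + (w-1) = w^2 - w < w^2$$

$$(w-1)^2 + 2(w-1) = w^2 - 1 < w^2$$

$$(w-1)^2 + (w^2-1) + 1 = 2w^2 - 2w + 1 < 2w^2$$

The first two sums fit in 64 bits with no carry out; the diagonal step can produce at most a single carry bit into the next column pair, which is what the adcs chain propagates.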
// VOID
// SYMCRYPT_CALL
// SymCryptFdefRawSquareAsm(
// _In_reads_(nDigits*SYMCRYPT_FDEF_DIGIT_NUINT32) PCUINT32 pSrc,
// UINT32 nDigits,
// _Out_writes_(2*nWords) PUINT32 pDst )
//
// Initial inputs to registers:
// pSrc -> r0
// nDigits -> r1
// pDst -> r2
//
// Register assignments
// r0 = pSrc
// r1 = negated word count of pSrc
// r2 = pSrc (moving forward one digit / 4 words every inner loop)
// r3 = negated digit count of pSrc
// r4 = pDst (moving forward one digit every inner loop)
// r5 = pDst (moving forward one word every outer loop)
// r6 = mulword from pSrc
// r7 = Stored negated digit count of pSrc
// r8 = Current words loaded from pSrc
// r9 = Cyclic counter for the jumps
// r10, r11 = "64-bit" sliding register to hold the result of multiplies,
// r10 also receives a word from pDst
// r12 = Negated digit counter of pSrc (updated every 4 iterations of main loop)
//
// Stack assignments
#define pDstSq 0 // Stored pDst in stack
#define pSrc 4 // Stored pSrc in stack
// LEAF_ENTRY SymCryptFdefRawSquareAsm
FUNCTION_START(SymCryptFdefRawSquareAsm, 3, 13)
// push {r4-r12, lr}
sub sp, sp, #8
mov r3, r1 // digit count into r3
lsl r1, r1, #2 // Calculate word count
neg r1, r1 // negate nWords
neg r3, r3 // negate nDigitsSq
mov r4, r2 // pDst
mov r5, r2 // store pDst
str r0, [sp, #pSrc] // store pSrc
str r5, [sp, #pDstSq] // store pDst
mov r7, r3 // store -nDigits for later
mov r12, r3 // Negated digit counter of pSrc
mov r2, r0 // inner loop pSrc
//
// First iteration of main loop (no adding of previous values from pDst)
//
ands r11, r11, #0 // Clearing the carry flag and setting r11 = 0
ldr r6, [r0] // load the first word from pSrc1
str r11, [r4] // store 0 for the first word
b SymCryptFdefRawSquareAsmInnerLoopInit_Word1
SymCryptFdefRawSquareAsmInnerLoopInit_Word0:
SQR_SINGLEADD_32 0
SymCryptFdefRawSquareAsmInnerLoopInit_Word1:
SQR_SINGLEADD_32 1
SQR_SINGLEADD_32 2
SQR_SINGLEADD_32 3
add r2, r2, #16
add r4, r4, #16
adds r3, r3, #1 // move one digit up
bne SymCryptFdefRawSquareAsmInnerLoopInit_Word0
str r11, [r4] // Store the next word into the destination
add r1, r1, #2 // move two words up (so we stop when real word count is "-1")
mov r9, #1 // Cyclic counter
//
// MAIN LOOP
//
SymCryptFdefRawSquareAsmOuterLoop:
add r5, r5, #4 // move start of pDst one word up
mov r3, r12 // set -nDigits
mov r2, r0 // set pSrc
mov r4, r5 // set pDst
ands r11, r11, #0 // Clearing the carry flag and setting r11 = 0
ldr r6, [r0, r9, LSL #2] // load the next word from pSrc
// Cyclic counter and jump logic
add r9, r9, #1
cmp r9, #1
beq SymCryptFdefRawSquareAsmInnerLoop_Word1
cmp r9, #2
beq SymCryptFdefRawSquareAsmInnerLoop_Word2
cmp r9, #3
beq SymCryptFdefRawSquareAsmInnerLoop_Word3
// The following instructions are only executed when r9 == 4
mov r9, #0 // Set it to 0
add r0, r0, #16 // move start of pSrc 4 words up
add r5, r5, #16 // move pDst 4 words up
mov r2, r0 // set pSrc
mov r4, r5 // set pDst
adds r3, r3, #1 // add 1 digit
mov r12, r3 // set the new digit counter
SymCryptFdefRawSquareAsmInnerLoop_Word0:
SQR_DOUBLEADD_32 0
SymCryptFdefRawSquareAsmInnerLoop_Word1:
SQR_DOUBLEADD_32 1
SymCryptFdefRawSquareAsmInnerLoop_Word2:
SQR_DOUBLEADD_32 2
SymCryptFdefRawSquareAsmInnerLoop_Word3:
SQR_DOUBLEADD_32 3
add r2, r2, #16
add r4, r4, #16
adds r3, r3, #1 // move one digit up
bne SymCryptFdefRawSquareAsmInnerLoop_Word0
str r11, [r4] // Store the next word into the destination
adds r1, r1, #1 // move one word up
bne SymCryptFdefRawSquareAsmOuterLoop
eor r11, r11, r11 // Setting r11 = 0
str r11, [r5, #20] // Store 0 to destination for the top word
// //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Second Pass - Shifting all results 1 bit left
// Third Pass - Adding the squares on the even columns and propagating the sum
// //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
mov r3, r7 // -nDigits
lsl r3, r3, #1 // Double digits
ldr r4, [sp, #pDstSq] // pDst pointer
ands r1, r1, #0 // Clear the flags
ands r2, r2, #0 // Clear the flags
SymCryptFdefRawSquareAsmSecondPass:
rrxs r2, r2 // set the carry flag if bit[0] of r2 is set
ldmia r4, {r8, r9}
adcs r8, r8, r8 // Shift left and add the carry
adcs r9, r9, r9
stmia r4!, {r8, r9}
ldmia r4, {r10, r11}
adcs r10, r10, r10 // Shift left and add the carry
adcs r11, r11, r11
stmia r4!, {r10, r11}
adc r2, r1, r1
adds r3, r3, #1 // move one digit up
bne SymCryptFdefRawSquareAsmSecondPass
ldr r0, [sp, #pSrc] // src pointer
ldr r4, [sp, #pDstSq] // pDst pointer
// mov r3, r7 // Use r7 as the digit counter
// ands r1, r1, #0 // Clear the flags
ands r2, r2, #0 // Clear the flags
SymCryptFdefRawSquareAsmThirdPass:
rrxs r2, r2 // set the carry flag if bit[0] of r2 is set
SQR_DIAGONAL_PROP 0
SQR_DIAGONAL_PROP 1
SQR_DIAGONAL_PROP 2
SQR_DIAGONAL_PROP 3
adc r2, r1, r1
add r0, r0, #16 // One digit up (not updated in SQR_DIAGONAL_PROP)
add r4, r4, #32 // Two digits up (not updated in SQR_DIAGONAL_PROP)
adds r7, r7, #1 // move one digit up
bne SymCryptFdefRawSquareAsmThirdPass
// Done, no return value
add sp, sp, #8
// pop {r4-r12, pc}
// LEAF_END SymCryptFdefRawSquareAsm
FUNCTION_END(SymCryptFdefRawSquareAsm)
//VOID
//SymCryptFdefMontgomeryReduceAsm(
// _In_ PCSYMCRYPT_MODULUS pmMod,
// _Inout_ PUINT32 pSrc,
// _Out_ PUINT32 pDst )
//
// Initial inputs to registers:
// pmMod -> r0
// pSrc -> r1
// pDst -> r2
//
// Register assignments
// r0 = pMod (moving forward one *digit* every inner loop)
// r1 = pSrc (moving forward one *digit* every inner loop)
// r2 = Stored pSrc (moving forward one word every outer loop)
// r3 = negated digit count of pSrc and pMod
// r4 = negated word count of pSrc
// r5, r6 = m = pSrc[i]*Inv64
// r7 = hc = high carry variable
// r8, r9 = Current words loaded from pMod
// <r12:r11:r10> = "96-bit" sliding register to hold the result of multiplies
// Stack assignments
#define pMod 0 // Stored pMod
#define pDst 4 // Stored pDst
#define nDigits 8 // Stored negated digit count of pSrc
#define inv64 12 // Inv64 of modulus
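For reference, the algebra behind the per-word multipliers computed in each outer iteration, assuming (as is conventional for Montgomery reduction) that Inv64 holds $-M^{-1} \bmod 2^{32}$ for modulus $M$:

$$m \equiv s_i \cdot \mathrm{Inv64} \pmod{2^{32}} \;\Longrightarrow\; s_i + mM \equiv s_i - s_i M^{-1} M \equiv 0 \pmod{2^{32}}$$

Adding $mM$ clears the low word, so the running value drops one word per iteration; after all words are processed the result is $\mathrm{pSrc} \cdot 2^{-32 \cdot \mathrm{nWords}} \bmod M$, up to the final conditional subtraction performed below.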
// LEAF_ENTRY SymCryptFdefMontgomeryReduceAsm
FUNCTION_START(SymCryptFdefMontgomeryReduceAsm, 3, 13)
// push {r4-r12, lr}
sub sp, sp, #16
str r2, [sp, #pDst] // Store pDst in the stack
ldr r3, [r0, #SymCryptModulusNdigitsOffsetArm] // # of Digits
ldr r5, [r0, #SymCryptModulusInv64OffsetArm] // Inv64 of modulus
add r0, r0, #SymCryptModulusValueOffsetArm // pMod
str r5, [sp, #inv64] // Store inv64 in the stack
lsl r4, r3, #2 // Multiply by 4 to get the number of words
neg r3, r3 // Negate the digit count
neg r4, r4 // Negate the word count
str r0, [sp, #pMod] // Store the pMod pointer
mov r2, r1 // Store the pSrc pointer
str r3, [sp, #nDigits] // Store the digit count for later
eor r7, r7, r7 // Set hc to 0
//
// Main loop
//
SymCryptFdefMontgomeryReduceAsmOuter:
ldr r3, [sp, #inv64] // Inv64 of modulus
ldmia r1, {r10, r12} // Load two words from pSrc
ldmia r0, {r8,r9} // Load two words from pMod
mov r11, #0
mul r5, r10, r3 // <31:0> bits of pSrc[i]*Inv64 = m1 (first multiplier)
umaal r10, r11, r5, r8 // r11 <-- High( m1*pMod[0] + pSrc[i] )
umaal r12, r11, r5, r9 // Calculate pSrc[i+1] = Low( m1*pMod[1] + pSrc[i+1] + High( m1*pMod[0] + pSrc[i] ))
mul r6, r12, r3 // <31:0> bits of pSrc[i+1]*Inv64 = m2
ldr r3, [sp, #nDigits] // Reset the digit counter
mov r11, #0 // Set c to 0
mov r12, #0 // Set c to 0
SymCryptFdefMontgomeryReduceAsmInner:
adds r3, r3, #1 // Move one digit up (none of the commands updates the carry)
ldmia r0!, {r8, r9} // Load two words from pMod[]
ldr r10, [r1] // pSrc[j]
umaal r10, r11, r5, r8 // c = <r11:r10> = m1 * pMod[j] + pSrc[j] + c
str r10, [r1], #4 // pSrc[j] = (UINT32) c
umaal r11, r12, r6, r8 // c = <r12:r11> = m2 * pMod[j] + c
ldr r10, [r1] // pSrc[j]
umaal r10, r11, r5, r9 // c = <r11:r10> = m1 * pMod[j] + pSrc[j] + c
str r10, [r1], #4 // pSrc[j] = (UINT32) c
umaal r11, r12, r6, r9 // c = <r12:r11> = m2 * pMod[j] + c
ldmia r0!, {r8, r9} // Load two words from pMod[]
ldr r10, [r1] // pSrc[j]
umaal r10, r11, r5, r8 // c = <r11:r10> = m1 * pMod[j] + pSrc[j] + c
str r10, [r1], #4 // pSrc[j] = (UINT32) c
umaal r11, r12, r6, r8 // c = <r12:r11> = m2 * pMod[j] + c
ldr r10, [r1] // pSrc[j]
umaal r10, r11, r5, r9 // c = <r11:r10> = m1 * pMod[j] + pSrc[j] + c
str r10, [r1], #4 // pSrc[j] = (UINT32) c
umaal r11, r12, r6, r9 // c = <r12:r11> = m2 * pMod[j] + c
bne SymCryptFdefMontgomeryReduceAsmInner
mov r8, #0 // r8 = 0
mov r9, #0 // r9 = 0
ldmia r1, {r5, r6} // Load pSrc[nWords] and pSrc[nWords+1]
adds r11, r11, r5 // c + pSrc[nWords]
adc r8, r8, #0 // Add the carry if any
adds r11, r11, r7 // c + pSrc[nWords] + hc
adc r8, r8, #0 // Add the carry if any
str r11, [r1], #4 // pSrc[nWords] = c
adds r12, r12, r6 // c + pSrc[nWords+1]
adc r9, r9, #0 // Add the carry if any
adds r12, r12, r8 // c + pSrc[nWords] + hc
adc r7, r9, #0 // Add the carry if any
str r12, [r1] // pSrc[nWords+1] = c
adds r4, r4, #2 // Move two words up
add r2, r2, #8 // Move stored pSrc pointer two words up
ldr r0, [sp, #pMod] // Restore the pMod pointer
mov r1, r2 // Restore the pSrc pointer
bne SymCryptFdefMontgomeryReduceAsmOuter
//
// Subtraction
//
// Prepare the pointers for subtract
mov r0, r2 // pSrc
mov r11, r2 // Store pSrc for later
ldr r1, [sp, #pMod] // pMod
ldr r2, [sp, #pDst] // pDst
ldr r3, [sp, #nDigits] // Reset the digit counter
mov r10, r7 // r10 = hc
mov r8, #0 // borrow = r8 = 0
mov r9, #0 // r9 = 0
SymCryptFdefMontgomeryReduceRawSubAsmLoop:
subs r8, r9, r8 // if r8>0 then the "borrow flag" is set
ldmia r0!, {r4, r6} // Load two words of pSrc1
ldmia r1!, {r5, r7} // Load two words of pSrc2
sbcs r4, r4, r5
sbcs r6, r6, r7
stmia r2!, {r4, r6} // Store the result in the destination
ldmia r0!, {r4, r6} // Load two words of pSrc1
ldmia r1!, {r5, r7} // Load two words of pSrc2
sbcs r4, r4, r5
sbcs r6, r6, r7
stmia r2!, {r4, r6} // Store the result in the destination
sbc r8, r9, r9 // If borrow=1, then r8 = -1 = 0xffffffff
adds r3, r3, #1 // Increment the digit count by one
bne SymCryptFdefMontgomeryReduceRawSubAsmLoop
// Prepare the pointers for masked copy
mov r0, r11 // pSrc
ldr r1, [sp, #pDst] // pDst
and r9, r8, #1 // If r8>0, set the return value to 1
orr r11, r10, r9 // r11 = hc|d
ldr r2, [sp, #nDigits] // Restore the digit counter
mov r9, #0 // r9 = 0
subs r4, r10, r11 // If (r11 > r10) clear the carry flag (i.e. borrow)
sbc r3, r9, r9 // r3 = mask = 0xffffffff if the carry flag is clear
// orn r9, r9, r3 // r9 = NOT(MASK) = 0 if r3 = 0xffffffff
orr r9, r9, r3
mvn r9, r9
mov r8, r1 // save the destination pointer
SymCryptFdefMontgomeryReduceMaskedCopyAsmLoop:
ldmia r0!, {r4, r6} // Load two words of the source
ldmia r1!, {r5, r7} // Load two words of the destination
and r4, r4, r3
and r5, r5, r9
orr r4, r4, r5
and r6, r6, r3
and r7, r7, r9
orr r6, r6, r7
stmia r8!, {r4, r6} // Store the two words in the destination
ldmia r0!, {r4, r6} // Load two words of the source
ldmia r1!, {r5, r7} // Load two words of the destination
and r4, r4, r3
and r5, r5, r9
orr r4, r4, r5
and r6, r6, r3
and r7, r7, r9
orr r6, r6, r7
stmia r8!, {r4, r6} // Store the two words in the destination
adds r2, r2, #1 // Increment the digit count by one
bne SymCryptFdefMontgomeryReduceMaskedCopyAsmLoop
// Done, no return value
add sp, sp, #16
// pop {r4-r12, pc}
// LEAF_END SymCryptFdefMontgomeryReduceAsm
FUNCTION_END(SymCryptFdefMontgomeryReduceAsm)
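For reference, here is a minimal Python sketch of the word-by-word Montgomery reduction that the assembly above implements, including the final raw subtraction and masked conditional copy. This is an illustration, not SymCrypt code: the names are invented, 32-bit words are assumed, and the sketch processes one word per pass where the assembly interleaves two multipliers (m1, m2) per pass.

def montgomery_reduce(src, mod, inv32):
    # src: 2*n little-endian 32-bit words, mod: n words,
    # inv32: -mod[0]^-1 mod 2^32 (the "Inv64" input in the comments above)
    M32 = 0xFFFFFFFF
    n = len(mod)
    hc = 0                                     # carry out of the top word
    for i in range(n):
        m = (src[i] * inv32) & M32             # multiplier that zeroes src[i]
        carry = 0
        for j in range(n):
            t = src[i + j] + m * mod[j] + carry
            src[i + j] = t & M32
            carry = t >> 32
        for k in range(i + n, 2 * n):          # propagate the carry upward
            t = src[k] + carry
            src[k] = t & M32
            carry = t >> 32
        hc |= carry
    res = src[n:]
    borrow = 0                                 # raw subtraction, as in the Sub loop
    diff = []
    for j in range(n):
        t = res[j] - mod[j] - borrow
        diff.append(t & M32)
        borrow = 1 if t < 0 else 0
    mask = M32 if (hc or not borrow) else 0    # keep diff iff result >= mod
    return [(d & mask) | (r & (mask ^ M32)) for d, r in zip(diff, res)]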

lib/arm/wipe.symcryptasm: new file, 20 lines
View file

@ -0,0 +1,20 @@
// SymCryptWipe
//
// Copyright (c) Microsoft Corporation. Licensed under the MIT license.
//
// Secure wipe
//
// VOID
// SYMCRYPT_CALL
// SymCryptWipe( _Out_writes_bytes_( cbData ) PVOID pbData,
// SIZE_T cbData )
FUNCTION_START(SymCryptWipeAsm, 2, 0)
// We just call memset.
// Routing the call through this assembly stub is enough to stop the compiler optimizing the memset away.
mov r2, r1
mov r1, #0
bl memset
FUNCTION_END(SymCryptWipeAsm)

View file

@ -825,48 +825,46 @@ SymCryptParallelSha256AppendBlocks_neon(
//
// This can probably be done faster, but we are missing the VTRN.64 instruction
// which makes it hard to do this efficiently in intrinsics.
// The vsetq_lane_u32 seems to be completely ignored by the compiler.
//
ha[7].n128_u32[0] = pChain[0]->H[0];
ha[7].n128_u32[1] = pChain[1]->H[0];
ha[7].n128_u32[2] = pChain[2]->H[0];
ha[7].n128_u32[3] = pChain[3]->H[0];
ha[6].n128_u32[0] = pChain[0]->H[1];
ha[6].n128_u32[1] = pChain[1]->H[1];
ha[6].n128_u32[2] = pChain[2]->H[1];
ha[6].n128_u32[3] = pChain[3]->H[1];
ha[5].n128_u32[0] = pChain[0]->H[2];
ha[5].n128_u32[1] = pChain[1]->H[2];
ha[5].n128_u32[2] = pChain[2]->H[2];
ha[5].n128_u32[3] = pChain[3]->H[2];
ha[4].n128_u32[0] = pChain[0]->H[3];
ha[4].n128_u32[1] = pChain[1]->H[3];
ha[4].n128_u32[2] = pChain[2]->H[3];
ha[4].n128_u32[3] = pChain[3]->H[3];
ha[3].n128_u32[0] = pChain[0]->H[4];
ha[3].n128_u32[1] = pChain[1]->H[4];
ha[3].n128_u32[2] = pChain[2]->H[4];
ha[3].n128_u32[3] = pChain[3]->H[4];
ha[2].n128_u32[0] = pChain[0]->H[5];
ha[2].n128_u32[1] = pChain[1]->H[5];
ha[2].n128_u32[2] = pChain[2]->H[5];
ha[2].n128_u32[3] = pChain[3]->H[5];
ha[1].n128_u32[0] = pChain[0]->H[6];
ha[1].n128_u32[1] = pChain[1]->H[6];
ha[1].n128_u32[2] = pChain[2]->H[6];
ha[1].n128_u32[3] = pChain[3]->H[6];
ha[0].n128_u32[0] = pChain[0]->H[7];
ha[0].n128_u32[1] = pChain[1]->H[7];
ha[0].n128_u32[2] = pChain[2]->H[7];
ha[0].n128_u32[3] = pChain[3]->H[7];
ha[7] = vsetq_lane_u32( pChain[0]->H[0], ha[7], 0 );
ha[7] = vsetq_lane_u32( pChain[1]->H[0], ha[7], 1 );
ha[7] = vsetq_lane_u32( pChain[2]->H[0], ha[7], 2 );
ha[7] = vsetq_lane_u32( pChain[3]->H[0], ha[7], 3 );
ha[6] = vsetq_lane_u32( pChain[0]->H[1], ha[6], 0 );
ha[6] = vsetq_lane_u32( pChain[1]->H[1], ha[6], 1 );
ha[6] = vsetq_lane_u32( pChain[2]->H[1], ha[6], 2 );
ha[6] = vsetq_lane_u32( pChain[3]->H[1], ha[6], 3 );
ha[5] = vsetq_lane_u32( pChain[0]->H[2], ha[5], 0 );
ha[5] = vsetq_lane_u32( pChain[1]->H[2], ha[5], 1 );
ha[5] = vsetq_lane_u32( pChain[2]->H[2], ha[5], 2 );
ha[5] = vsetq_lane_u32( pChain[3]->H[2], ha[5], 3 );
ha[4] = vsetq_lane_u32( pChain[0]->H[3], ha[4], 0 );
ha[4] = vsetq_lane_u32( pChain[1]->H[3], ha[4], 1 );
ha[4] = vsetq_lane_u32( pChain[2]->H[3], ha[4], 2 );
ha[4] = vsetq_lane_u32( pChain[3]->H[3], ha[4], 3 );
ha[3] = vsetq_lane_u32( pChain[0]->H[4], ha[3], 0 );
ha[3] = vsetq_lane_u32( pChain[1]->H[4], ha[3], 1 );
ha[3] = vsetq_lane_u32( pChain[2]->H[4], ha[3], 2 );
ha[3] = vsetq_lane_u32( pChain[3]->H[4], ha[3], 3 );
ha[2] = vsetq_lane_u32( pChain[0]->H[5], ha[2], 0 );
ha[2] = vsetq_lane_u32( pChain[1]->H[5], ha[2], 1 );
ha[2] = vsetq_lane_u32( pChain[2]->H[5], ha[2], 2 );
ha[2] = vsetq_lane_u32( pChain[3]->H[5], ha[2], 3 );
ha[1] = vsetq_lane_u32( pChain[0]->H[6], ha[1], 0 );
ha[1] = vsetq_lane_u32( pChain[1]->H[6], ha[1], 1 );
ha[1] = vsetq_lane_u32( pChain[2]->H[6], ha[1], 2 );
ha[1] = vsetq_lane_u32( pChain[3]->H[6], ha[1], 3 );
ha[0] = vsetq_lane_u32( pChain[0]->H[7], ha[0], 0 );
ha[0] = vsetq_lane_u32( pChain[1]->H[7], ha[0], 1 );
ha[0] = vsetq_lane_u32( pChain[2]->H[7], ha[0], 2 );
ha[0] = vsetq_lane_u32( pChain[3]->H[7], ha[0], 3 );
buf[0] = ha[4];
buf[1] = ha[5];
@ -881,10 +879,10 @@ SymCryptParallelSha256AppendBlocks_neon(
//
for( r=0; r<16; r ++ )
{
T0.n128_u32[0] = SYMCRYPT_LOAD_MSBFIRST32( ppByte[0] ); ppByte[0] += 4;
T0.n128_u32[1] = SYMCRYPT_LOAD_MSBFIRST32( ppByte[1] ); ppByte[1] += 4;
T0.n128_u32[2] = SYMCRYPT_LOAD_MSBFIRST32( ppByte[2] ); ppByte[2] += 4;
T0.n128_u32[3] = SYMCRYPT_LOAD_MSBFIRST32( ppByte[3] ); ppByte[3] += 4;
T0 = vsetq_lane_u32( SYMCRYPT_LOAD_MSBFIRST32( ppByte[0] ), T0, 0 ); ppByte[0] += 4;
T0 = vsetq_lane_u32( SYMCRYPT_LOAD_MSBFIRST32( ppByte[1] ), T0, 1 ); ppByte[1] += 4;
T0 = vsetq_lane_u32( SYMCRYPT_LOAD_MSBFIRST32( ppByte[2] ), T0, 2 ); ppByte[2] += 4;
T0 = vsetq_lane_u32( SYMCRYPT_LOAD_MSBFIRST32( ppByte[3] ), T0, 3 ); ppByte[3] += 4;
W[r] = T0;
}
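The lane loads above amount to a 4-way transpose: lane j of ha[7-k] receives pChain[j]->H[k], and the message loop fills lane j of each W[r] from stream j. A plain-Python sketch of the chaining-state transpose, with lists standing in for NEON vectors and all names assumed:

def transpose_chaining(H):
    # H[j][k] == pChain[j]->H[k]; the result plays the role of ha[],
    # so result[7 - k][j] == H[j][k], matching the vsetq_lane_u32 pattern
    return [[H[j][7 - i] for j in range(4)] for i in range(8)]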
@ -964,47 +962,46 @@ SymCryptParallelSha256AppendBlocks_neon(
//
// Copy the chaining state back into the hash structure
//
pChain[0]->H[0] = ha[7].n128_u32[0];
pChain[1]->H[0] = ha[7].n128_u32[1];
pChain[2]->H[0] = ha[7].n128_u32[2];
pChain[3]->H[0] = ha[7].n128_u32[3];
pChain[0]->H[1] = ha[6].n128_u32[0];
pChain[1]->H[1] = ha[6].n128_u32[1];
pChain[2]->H[1] = ha[6].n128_u32[2];
pChain[3]->H[1] = ha[6].n128_u32[3];
pChain[0]->H[2] = ha[5].n128_u32[0];
pChain[1]->H[2] = ha[5].n128_u32[1];
pChain[2]->H[2] = ha[5].n128_u32[2];
pChain[3]->H[2] = ha[5].n128_u32[3];
pChain[0]->H[3] = ha[4].n128_u32[0];
pChain[1]->H[3] = ha[4].n128_u32[1];
pChain[2]->H[3] = ha[4].n128_u32[2];
pChain[3]->H[3] = ha[4].n128_u32[3];
pChain[0]->H[4] = ha[3].n128_u32[0];
pChain[1]->H[4] = ha[3].n128_u32[1];
pChain[2]->H[4] = ha[3].n128_u32[2];
pChain[3]->H[4] = ha[3].n128_u32[3];
pChain[0]->H[5] = ha[2].n128_u32[0];
pChain[1]->H[5] = ha[2].n128_u32[1];
pChain[2]->H[5] = ha[2].n128_u32[2];
pChain[3]->H[5] = ha[2].n128_u32[3];
pChain[0]->H[6] = ha[1].n128_u32[0];
pChain[1]->H[6] = ha[1].n128_u32[1];
pChain[2]->H[6] = ha[1].n128_u32[2];
pChain[3]->H[6] = ha[1].n128_u32[3];
pChain[0]->H[7] = ha[0].n128_u32[0];
pChain[1]->H[7] = ha[0].n128_u32[1];
pChain[2]->H[7] = ha[0].n128_u32[2];
pChain[3]->H[7] = ha[0].n128_u32[3];
pChain[0]->H[0] = vgetq_lane_u32( ha[7], 0 );
pChain[1]->H[0] = vgetq_lane_u32( ha[7], 1 );
pChain[2]->H[0] = vgetq_lane_u32( ha[7], 2 );
pChain[3]->H[0] = vgetq_lane_u32( ha[7], 3 );
pChain[0]->H[1] = vgetq_lane_u32( ha[6], 0 );
pChain[1]->H[1] = vgetq_lane_u32( ha[6], 1 );
pChain[2]->H[1] = vgetq_lane_u32( ha[6], 2 );
pChain[3]->H[1] = vgetq_lane_u32( ha[6], 3 );
pChain[0]->H[2] = vgetq_lane_u32( ha[5], 0 );
pChain[1]->H[2] = vgetq_lane_u32( ha[5], 1 );
pChain[2]->H[2] = vgetq_lane_u32( ha[5], 2 );
pChain[3]->H[2] = vgetq_lane_u32( ha[5], 3 );
pChain[0]->H[3] = vgetq_lane_u32( ha[4], 0 );
pChain[1]->H[3] = vgetq_lane_u32( ha[4], 1 );
pChain[2]->H[3] = vgetq_lane_u32( ha[4], 2 );
pChain[3]->H[3] = vgetq_lane_u32( ha[4], 3 );
pChain[0]->H[4] = vgetq_lane_u32( ha[3], 0 );
pChain[1]->H[4] = vgetq_lane_u32( ha[3], 1 );
pChain[2]->H[4] = vgetq_lane_u32( ha[3], 2 );
pChain[3]->H[4] = vgetq_lane_u32( ha[3], 3 );
pChain[0]->H[5] = vgetq_lane_u32( ha[2], 0 );
pChain[1]->H[5] = vgetq_lane_u32( ha[2], 1 );
pChain[2]->H[5] = vgetq_lane_u32( ha[2], 2 );
pChain[3]->H[5] = vgetq_lane_u32( ha[2], 3 );
pChain[0]->H[6] = vgetq_lane_u32( ha[1], 0 );
pChain[1]->H[6] = vgetq_lane_u32( ha[1], 1 );
pChain[2]->H[6] = vgetq_lane_u32( ha[1], 2 );
pChain[3]->H[6] = vgetq_lane_u32( ha[1], 3 );
pChain[0]->H[7] = vgetq_lane_u32( ha[0], 0 );
pChain[1]->H[7] = vgetq_lane_u32( ha[0], 1 );
pChain[2]->H[7] = vgetq_lane_u32( ha[0], 2 );
pChain[3]->H[7] = vgetq_lane_u32( ha[0], 3 );
SymCryptWipeKnownSize( buf, sizeof( buf ) );
}

View file

@ -37,7 +37,7 @@ include ksamd64.inc
#define GET_SYMBOL_ADDRESS(__symbol) __symbol@plt+rip
#define HEX(__constant) 0x##__constant
#define TEXTAREA()
#define EXTERN(__label)
#define EXTERN(__label) .extern __label
#define LABEL(__labelname) __labelname:
#else

View file

@ -19,7 +19,7 @@ set(KEEP_SYMBOL_ARGS
)
# Determine which executable to use for stripping binaries
if(SYMCRYPT_TARGET_ARCH MATCHES ARM64 AND NOT CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "ARM64|aarch64")
if(SYMCRYPT_TARGET_ARCH MATCHES ARM AND NOT CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "ARM64|aarch64")
set(STRIP_COMMAND ${TARGET_TRIPLE}-strip)
set(OBJCOPY_COMMAND ${TARGET_TRIPLE}-objcopy)
else()

View file

@ -8,8 +8,152 @@
#include "precomp.h"
#if UINTPTR_MAX == 0xFFFFFFFF
#define Elf_Shdr Elf32_Shdr
#define Elf_Phdr Elf32_Phdr
#define Elf_Sym Elf32_Sym
#define Elf_Dyn Elf32_Dyn
#define Elf_Ehdr Elf32_Ehdr
#define Elf_Addr Elf32_Addr
#define Elf_Off Elf32_Off
#define Elf_Rel Elf32_Rel
#define Elf_Rela Elf32_Rela
#define ELF_R_TYPE(X) ELF32_R_TYPE(X)
#define ELF_R_SYM(X) ELF32_R_SYM(X)
#define Elf_Word Elf32_Word
#define SYMCRYPT_FORCE_READ_ADDR SYMCRYPT_FORCE_READ32
#elif UINTPTR_MAX == 0xFFFFFFFFFFFFFFFFu
#define Elf_Shdr Elf64_Shdr
#define Elf_Phdr Elf64_Phdr
#define Elf_Sym Elf64_Sym
#define Elf_Dyn Elf64_Dyn
#define Elf_Ehdr Elf64_Ehdr
#define Elf_Addr Elf64_Addr
#define Elf_Off Elf64_Off
#define Elf_Rel Elf64_Rel
#define Elf_Rela Elf64_Rela
#define ELF_R_TYPE(X) ELF64_R_TYPE(X)
#define ELF_R_SYM(X) ELF64_R_SYM(X)
#define Elf_Word Elf64_Word
#define SYMCRYPT_FORCE_READ_ADDR SYMCRYPT_FORCE_READ64
#else
#error Unknown CPU pointer size
#endif
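A rough Python analogue of the width selection above, assuming, as the C code does, that the ELF class always matches the build's pointer size:

import struct

# 4-byte pointers imply ELF32 (Elf32_* types), 8-byte pointers imply ELF64
ELF_CLASS = {4: "ELF32", 8: "ELF64"}[struct.calcsize("P")]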
#ifdef SYMCRYPT_DEBUG_INTEGRITY
#include <stdio.h>
VOID
DbgDumpHex(PCBYTE pbData, SIZE_T cbData)
{
ULONG i,count;
CHAR digits[]="0123456789abcdef";
CHAR pbLine[256];
ULONG cbLine, cbHeader = 0;
ULONG_PTR address;
if(pbData == NULL && cbData != 0)
{
// strcat_s(pbLine, RTL_NUMBER_OF(pbLine), "<null> buffer!!!\n");
fprintf(stderr, "<null> buffer!!!\n");
return;
}
for(; cbData ; cbData -= count, pbData += count)
{
count = (cbData > 16) ? 16:cbData;
cbLine = cbHeader;
address = (ULONG_PTR)pbData;
#if UINTPTR_MAX == 0xFFFFFFFFFFFFFFFFu
// 64 bit addresses.
pbLine[cbLine++] = digits[(address >> 0x3c) & 0x0f];
pbLine[cbLine++] = digits[(address >> 0x38) & 0x0f];
pbLine[cbLine++] = digits[(address >> 0x34) & 0x0f];
pbLine[cbLine++] = digits[(address >> 0x30) & 0x0f];
pbLine[cbLine++] = digits[(address >> 0x2c) & 0x0f];
pbLine[cbLine++] = digits[(address >> 0x28) & 0x0f];
pbLine[cbLine++] = digits[(address >> 0x24) & 0x0f];
pbLine[cbLine++] = digits[(address >> 0x20) & 0x0f];
#endif
pbLine[cbLine++] = digits[(address >> 0x1c) & 0x0f];
pbLine[cbLine++] = digits[(address >> 0x18) & 0x0f];
pbLine[cbLine++] = digits[(address >> 0x14) & 0x0f];
pbLine[cbLine++] = digits[(address >> 0x10) & 0x0f];
pbLine[cbLine++] = digits[(address >> 0x0c) & 0x0f];
pbLine[cbLine++] = digits[(address >> 0x08) & 0x0f];
pbLine[cbLine++] = digits[(address >> 0x04) & 0x0f];
pbLine[cbLine++] = digits[(address ) & 0x0f];
pbLine[cbLine++] = ' ';
pbLine[cbLine++] = ' ';
for(i = 0; i < count; i++)
{
pbLine[cbLine++] = digits[pbData[i]>>4];
pbLine[cbLine++] = digits[pbData[i]&0x0f];
if(i == 7)
{
pbLine[cbLine++] = ':';
}
else
{
pbLine[cbLine++] = ' ';
}
}
for(; i < 16; i++)
{
pbLine[cbLine++] = ' ';
pbLine[cbLine++] = ' ';
pbLine[cbLine++] = ' ';
}
pbLine[cbLine++] = ' ';
for(i = 0; i < count; i++)
{
if(pbData[i] < 32 || pbData[i] > 126)
{
pbLine[cbLine++] = '.';
}
else
{
pbLine[cbLine++] = pbData[i];
}
}
pbLine[cbLine++] = 0;
fprintf(stderr, "%s\n", pbLine);
}
}
VOID
SYMCRYPT_CALL
SymCryptHmacSha256AppendDbg(
_In_ CHAR* pszLabel,
_Inout_ PSYMCRYPT_HMAC_SHA256_STATE pState,
_In_reads_( cbData ) PCBYTE pbData,
SIZE_T cbData )
{
fprintf(stderr, "\nHMAC append: %s size %lx\n", pszLabel, cbData);
DbgDumpHex(pbData, cbData);
SymCryptHmacSha256Append(pState, pbData, cbData);
}
#endif
// These placeholder values must match the values in process_fips_module.py
#if UINTPTR_MAX == 0xFFFFFFFF
#define PLACEHOLDER_VALUE 0x8BADF00D
#elif UINTPTR_MAX == 0xFFFFFFFFFFFFFFFFu
#define PLACEHOLDER_VALUE 0x4BADF00D8BADF00D
#else
#error Unknown CPU pointer size
#endif
#define PLACEHOLDER_ARRAY \
{\
0x5B, 0x75, 0xBB, 0xE4, 0x9E, 0x18, 0x03, 0x55,\
@ -29,11 +173,11 @@
// Relative virtual address of the HMAC key. Used to calculate where the module starts in memory
// at runtime.
const Elf64_Addr SymCryptVolatileFipsHmacKeyRva = (Elf64_Addr) PLACEHOLDER_VALUE;
const Elf_Addr SymCryptVolatileFipsHmacKeyRva = (Elf_Addr) PLACEHOLDER_VALUE;
// Offset to the end of the FIPS module. Bytes after this offset are not considered part of our
// FIPS module and are not included in the HMAC digest.
const Elf64_Off SymCryptVolatileFipsBoundaryOffset = PLACEHOLDER_VALUE;
const Elf_Off SymCryptVolatileFipsBoundaryOffset = PLACEHOLDER_VALUE;
// Key used for HMAC.
const unsigned char SymCryptVolatileFipsHmacKey[32] = PLACEHOLDER_ARRAY;
@ -44,32 +188,43 @@ unsigned char SymCryptVolatileFipsHmacDigest[SYMCRYPT_HMAC_SHA256_RESULT_SIZE] =
typedef struct
{
Elf64_Rela* rela;
Elf_Rela* rela;
Elf_Rel* rel;
size_t relaEntryCount;
Elf64_Rela* pltRela;
size_t relEntryCount;
union {
Elf_Rela* rela;
Elf_Rel* rel;
Elf_Addr addr;
} plt;
size_t pltRelaEntryCount;
} Elf64_Rela_Info;
Elf_Addr pltRelAddendType;
} Elf_Rela_Info;
VOID SymCryptModuleUndoRelocation(
_In_ const Elf64_Addr module_base,
_Inout_ Elf64_Xword* const target,
_In_ const Elf64_Rela* rela )
_In_ const Elf_Addr module_base,
_Inout_ Elf_Addr* const target,
_In_ const Elf_Word relType )
{
Elf64_Xword replacement = 0;
Elf_Addr replacement = 0;
switch( ELF64_R_TYPE( rela->r_info ) )
switch( relType )
{
case R_X86_64_RELATIVE:
case R_AARCH64_RELATIVE:
replacement = *target - (Elf64_Off) module_base;
case R_ARM_RELATIVE:
replacement = *target - (Elf_Off) module_base;
break;
case R_X86_64_64:
case R_X86_64_GLOB_DAT:
case R_X86_64_JUMP_SLOT:
case R_AARCH64_GLOB_DAT:
case R_AARCH64_JUMP_SLOT:
// R_X86_64_64, R_X86_64_GLOB_DAT and R_AARCH64_GLOB_DAT relocations all have initial
// values of zero. R_X86_64_JUMP_SLOT and R_AARCH64_JUMP_SLOT relocations have initial
case R_ARM_GLOB_DAT:
case R_ARM_JUMP_SLOT:
// R_X86_64_64 and R_*_GLOB_DAT relocations all have initial
// values of zero. R_*_JUMP_SLOT relocations have initial
// values that point into the PLT, but we set these to zero in our post-processing
// script before HMACing the module. These relocation targets are excluded from our
// FIPS module boundary because they're used for external function calls, which we
@ -87,36 +242,51 @@ VOID SymCryptModuleUndoRelocation(
}
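The undo rule can be restated as a small hypothetical Python helper; the relocation types are spelled as strings purely for illustration, and the default 0xFFFFFFFF mask reflects the 32-bit case relevant to EM_ARM:

R_RELATIVE = {"R_X86_64_RELATIVE", "R_AARCH64_RELATIVE", "R_ARM_RELATIVE"}
R_ZEROED_ON_DISK = {"R_X86_64_64", "R_X86_64_GLOB_DAT", "R_X86_64_JUMP_SLOT",
                    "R_AARCH64_GLOB_DAT", "R_AARCH64_JUMP_SLOT",
                    "R_ARM_GLOB_DAT", "R_ARM_JUMP_SLOT"}

def undo_relocation(target_value, module_base, rel_type, mask=0xFFFFFFFF):
    if rel_type in R_RELATIVE:
        return (target_value - module_base) & mask  # the loader added the base
    if rel_type in R_ZEROED_ON_DISK:
        return 0      # zeroed on disk by process_fips_module.py before HMACing
    return target_value  # unknown types are left unchanged in this sketch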
VOID SymCryptModuleFindRelocationInfo(
_In_ const Elf64_Dyn* const dynStart,
_Out_ Elf64_Rela_Info* relaInfo)
_In_ const Elf_Dyn* const dynStart,
_Out_ Elf_Rela_Info* relaInfo)
{
relaInfo->rela = NULL;
relaInfo->relaEntryCount = 0;
relaInfo->pltRela = NULL;
relaInfo->relEntryCount = 0;
relaInfo->plt.addr = 0;
relaInfo->pltRelaEntryCount = 0;
size_t relaTotalSize = 0;
size_t relTotalSize = 0;
size_t relaEntrySize = 0;
size_t relEntrySize = 0;
size_t pltTotalSize = 0;
for( const Elf64_Dyn* dyn = dynStart; dyn->d_tag != DT_NULL; ++dyn )
for( const Elf_Dyn* dyn = dynStart; dyn->d_tag != DT_NULL; ++dyn )
{
switch( dyn->d_tag )
{
case DT_RELA:
relaInfo->rela = ( Elf64_Rela* ) dyn->d_un.d_ptr;
relaInfo->rela = ( Elf_Rela* ) dyn->d_un.d_ptr;
break;
case DT_REL:
relaInfo->rel = ( Elf_Rel* ) dyn->d_un.d_ptr;
break;
case DT_RELASZ:
relaTotalSize = dyn->d_un.d_val;
break;
case DT_RELSZ:
relTotalSize = dyn->d_un.d_val;
break;
case DT_RELAENT:
relaEntrySize = dyn->d_un.d_val;
break;
case DT_RELENT:
relEntrySize = dyn->d_un.d_val;
break;
case DT_JMPREL:
relaInfo->pltRela = ( Elf64_Rela* ) dyn->d_un.d_ptr;
relaInfo->plt.addr = ( Elf_Addr ) dyn->d_un.d_ptr;
break;
case DT_PLTRELSZ:
@ -124,9 +294,8 @@ VOID SymCryptModuleFindRelocationInfo(
break;
case DT_PLTREL:
// Make sure PLT entries are DT_RELA entries and not DT_REL; we do not support
// the latter
SYMCRYPT_FIPS_ASSERT( dyn->d_un.d_val == DT_RELA );
SYMCRYPT_FIPS_ASSERT( dyn->d_un.d_val == DT_RELA || dyn->d_un.d_val == DT_REL );
relaInfo->pltRelAddendType = dyn->d_un.d_val;
break;
default:
@ -134,26 +303,45 @@ VOID SymCryptModuleFindRelocationInfo(
}
}
SYMCRYPT_FIPS_ASSERT( relaInfo->rela != NULL );
SYMCRYPT_FIPS_ASSERT( relaEntrySize == sizeof( Elf64_Rela ) );
SYMCRYPT_FIPS_ASSERT( relaTotalSize != 0 && relaTotalSize % relaEntrySize == 0 );
// Need to have at least one type of relocation.
SYMCRYPT_FIPS_ASSERT( relaInfo->rela != NULL || relaInfo->rel != NULL );
relaInfo->relaEntryCount = relaTotalSize / relaEntrySize;
// On AMD64 there should not be a PLT section, because we can't currently handle AMD64 PLT
// relocations
if( relaInfo->pltRela != NULL)
if ( relaInfo->rela != NULL)
{
SYMCRYPT_FIPS_ASSERT( pltTotalSize != 0 && pltTotalSize % sizeof( Elf64_Rela ) == 0 );
relaInfo->pltRelaEntryCount = pltTotalSize / sizeof( Elf64_Rela );
SYMCRYPT_FIPS_ASSERT( relaEntrySize == sizeof( Elf_Rela ) );
SYMCRYPT_FIPS_ASSERT( relaTotalSize != 0 && relaTotalSize % relaEntrySize == 0 );
relaInfo->relaEntryCount = relaTotalSize / relaEntrySize;
}
if ( relaInfo->rel != NULL )
{
SYMCRYPT_FIPS_ASSERT( relaInfo->rel != NULL );
SYMCRYPT_FIPS_ASSERT( relEntrySize == sizeof( Elf_Rel ) );
SYMCRYPT_FIPS_ASSERT( relTotalSize != 0 && relTotalSize % relEntrySize == 0 );
relaInfo->relEntryCount = relTotalSize / relEntrySize;
}
if( relaInfo->plt.addr != 0)
{
SYMCRYPT_FIPS_ASSERT( pltTotalSize != 0 );
// PLT relocations are either all rel or all rela.
if ( relaInfo->pltRelAddendType == DT_RELA )
{
SYMCRYPT_FIPS_ASSERT( pltTotalSize % sizeof( Elf_Rela ) == 0 );
relaInfo->pltRelaEntryCount = pltTotalSize / sizeof( Elf_Rela );
}
else
{
SYMCRYPT_FIPS_ASSERT( pltTotalSize % sizeof( Elf_Rel ) == 0 );
relaInfo->pltRelaEntryCount = pltTotalSize / sizeof( Elf_Rel );
}
}
}
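Roughly the same DT_* scan can be done offline with pyelftools; a sketch, assuming the generic module path used elsewhere in this PR (this mirrors the C logic, it does not replace it):

from elftools.elf.elffile import ELFFile

with open("bin/module/generic/libsymcrypt.so", "rb") as f:
    dynamic = ELFFile(f).get_section_by_name(".dynamic")
    wanted = ("DT_RELA", "DT_RELASZ", "DT_RELAENT", "DT_REL", "DT_RELSZ",
              "DT_RELENT", "DT_JMPREL", "DT_PLTRELSZ", "DT_PLTREL")
    # map tag name -> value for the relocation-related entries
    tags = {t.entry.d_tag: t.entry.d_val for t in dynamic.iter_tags()
            if t.entry.d_tag in wanted}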
size_t SymCryptModuleProcessSegmentWithRelocations(
_In_ const Elf64_Addr module_base,
_In_ const Elf64_Phdr* const programHeader,
_In_ const Elf64_Dyn* const dynStart,
_In_ const Elf64_Rela_Info* const relaInfo,
_In_ const Elf_Addr module_base,
_In_ const Elf_Phdr* const programHeader,
_In_ const Elf_Dyn* const dynStart,
_In_ const Elf_Rela_Info* const relaInfo,
_Inout_ SYMCRYPT_HMAC_SHA256_STATE* hmacState )
{
// The segment that contains relocations consists of the following sections, in this order:
@ -169,10 +357,19 @@ size_t SymCryptModuleProcessSegmentWithRelocations(
// the module on disk is usually a different size than at runtime, so we cannot include it in
// our HMAC either.
//
// On ARM (32-bit), relocations also exist in the .text section, so we run SymCryptModuleProcessSegmentWithRelocations
// on every segment.
//
// FipsBoundaryOffset marks the start of the .data section, so we read from the start of the
// segment up to that offset.
size_t hashableSectionSize = SYMCRYPT_FORCE_READ64( &SymCryptVolatileFipsBoundaryOffset ) - programHeader->p_offset;
Elf64_Addr segmentStart = module_base + programHeader->p_vaddr;
size_t hashableSectionSize = programHeader->p_filesz;
Elf_Addr segmentStart = module_base + programHeader->p_vaddr;
// If the data section is in this segment then exclude that from being hashed.
if( SYMCRYPT_FORCE_READ_ADDR( &SymCryptVolatileFipsBoundaryOffset ) <= programHeader->p_offset + programHeader->p_filesz )
{
hashableSectionSize = SYMCRYPT_FORCE_READ_ADDR( &SymCryptVolatileFipsBoundaryOffset ) - programHeader->p_offset;
}
BYTE* segmentCopy = SymCryptCallbackAlloc( hashableSectionSize );
SYMCRYPT_FIPS_ASSERT( segmentCopy != NULL );
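The boundary clamp a few lines up, isolated as a tiny Python sketch with assumed names:

def hashable_length(p_offset, p_filesz, fips_boundary_offset):
    # hash the whole segment unless the .data boundary falls inside it
    if fips_boundary_offset <= p_offset + p_filesz:
        return fips_boundary_offset - p_offset
    return p_filesz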
@ -184,63 +381,103 @@ size_t SymCryptModuleProcessSegmentWithRelocations(
// these relocations separately. We find the .dynamic section in the copied buffer based on
// its offset from the start of the section, which is calculated by subtracting the address
// of the start of the segment from the address of the .dynamic section in the segment.
Elf64_Off dynOffsetInBuffer = (Elf64_Addr) dynStart - (Elf64_Addr) segmentStart;
Elf64_Dyn* dynStartInBuffer = (Elf64_Dyn*) (segmentCopy + dynOffsetInBuffer);
Elf_Off dynOffsetInBuffer = (Elf_Addr) dynStart - (Elf_Addr) segmentStart;
Elf_Dyn* dynStartInBuffer = (Elf_Dyn*) (segmentCopy + dynOffsetInBuffer);
for( Elf64_Dyn* dyn = dynStartInBuffer; dyn->d_tag != DT_NULL; ++dyn )
// If this segment contains the dynamic section then we need to process the relocations in it.
if ((Elf_Addr)dynStart > segmentStart && (Elf_Addr)dynStart < segmentStart + hashableSectionSize)
{
// The following types of .dynamic entries have the module's base address added to
// their initial value
if( dyn->d_tag == DT_HASH ||
dyn->d_tag == DT_STRTAB ||
dyn->d_tag == DT_SYMTAB ||
dyn->d_tag == DT_RELA ||
dyn->d_tag == DT_GNU_HASH ||
dyn->d_tag == DT_VERSYM ||
dyn->d_tag == DT_PLTGOT ||
dyn->d_tag == DT_JMPREL)
for( Elf_Dyn* dyn = dynStartInBuffer; dyn->d_tag != DT_NULL; ++dyn )
{
dyn->d_un.d_val -= (Elf64_Xword) module_base;
// The following types of .dynamic entries have the module's base address added to
// their initial value
if( dyn->d_tag == DT_HASH ||
dyn->d_tag == DT_STRTAB ||
dyn->d_tag == DT_SYMTAB ||
dyn->d_tag == DT_RELA ||
dyn->d_tag == DT_GNU_HASH ||
dyn->d_tag == DT_VERSYM ||
dyn->d_tag == DT_PLTGOT ||
dyn->d_tag == DT_JMPREL ||
dyn->d_tag == DT_REL)
{
dyn->d_un.d_val -= module_base;
}
}
}
// Now we can process the normal relocations listed in the relocation table
for( size_t i = 0; i < relaInfo->relaEntryCount; ++i )
{
const Elf64_Rela* rela = relaInfo->rela + i;
const Elf_Rela* rela = relaInfo->rela + i;
// Find the relocation within the section. Note that for a shared object module,
// rela->r_offset is actually a virtual address. Relocations can occur within the .data
// section, which is outside our FIPS boundary, so any such relocations can be ignored.
Elf64_Off offsetInBuffer = (Elf64_Off) rela->r_offset - (Elf64_Off) programHeader->p_vaddr;
Elf_Off offsetInBuffer = (Elf_Off) rela->r_offset - (Elf_Off) programHeader->p_vaddr;
if( offsetInBuffer > hashableSectionSize )
{
continue;
}
Elf64_Xword* target = (Elf64_Xword*) ( segmentCopy + offsetInBuffer);
Elf_Addr* target = (Elf_Addr*) ( segmentCopy + offsetInBuffer);
SymCryptModuleUndoRelocation( module_base, target, rela );
SymCryptModuleUndoRelocation( module_base, target, ELF_R_TYPE( rela->r_info ) );
}
// Process the GOT entries from the .rela.plt section. Same as process above, just
for( size_t i = 0; i < relaInfo->relEntryCount; ++i )
{
const Elf_Rel* rel = relaInfo->rel + i;
// Find the relocation within the section. Note that for a shared object module,
// rela->r_offset is actually a virtual address. Relocations can occur within the .data
// section, which is outside our FIPS boundary, so any such relocations can be ignored.
Elf_Off offsetInBuffer = (Elf_Off) rel->r_offset - (Elf_Off) programHeader->p_vaddr;
if( offsetInBuffer > hashableSectionSize )
{
continue;
}
Elf_Addr* target = (Elf_Addr*) ( segmentCopy + offsetInBuffer);
SymCryptModuleUndoRelocation( module_base, target, ELF_R_TYPE( rel->r_info ) );
}
// Process the GOT entries from the .rela.plt or .rel.plt section. Same as the process above, just
// with a different table.
for( size_t i = 0; i < relaInfo->pltRelaEntryCount; ++i)
{
const Elf64_Rela* rela = relaInfo->pltRela + i;
Elf_Word type = 0;
Elf_Off offsetInBuffer = 0;
if (relaInfo->pltRelAddendType == DT_RELA)
{
const Elf_Rela* rela = relaInfo->plt.rela + i;
type = ELF_R_TYPE( rela->r_info );
offsetInBuffer = (Elf_Off) rela->r_offset - (Elf_Off) programHeader->p_vaddr;
}
else
{
const Elf_Rel* rel = relaInfo->plt.rel + i;
type = ELF_R_TYPE( rel->r_info );
offsetInBuffer = (Elf_Off) rel->r_offset - (Elf_Off) programHeader->p_vaddr;
}
Elf64_Off offsetInBuffer = (Elf64_Off) rela->r_offset - (Elf64_Off) programHeader->p_vaddr;
if( offsetInBuffer > hashableSectionSize )
{
continue;
}
Elf64_Xword* target = (Elf64_Xword*) ( segmentCopy + offsetInBuffer);
Elf_Addr* target = (Elf_Addr*) ( segmentCopy + offsetInBuffer);
SymCryptModuleUndoRelocation( module_base, target, rela );
SymCryptModuleUndoRelocation( module_base, target, type );
}
#if SYMCRYPT_DEBUG_INTEGRITY
SymCryptHmacSha256AppendDbg( "Append after relocation adjust", hmacState, segmentCopy, hashableSectionSize );
#else
SymCryptHmacSha256Append( hmacState, segmentCopy, hashableSectionSize );
#endif
SymCryptCallbackFree( segmentCopy );
@ -248,9 +485,9 @@ size_t SymCryptModuleProcessSegmentWithRelocations(
}
VOID SymCryptModuleDoHmac(
_In_ const Elf64_Addr module_base,
_In_ const Elf64_Dyn* const dynStart,
_In_ const Elf64_Rela_Info* const relaInfo )
_In_ const Elf_Addr module_base,
_In_ const Elf_Dyn* const dynStart,
_In_ const Elf_Rela_Info* const relaInfo )
{
SYMCRYPT_ERROR scError = SYMCRYPT_NO_ERROR;
SYMCRYPT_HMAC_SHA256_EXPANDED_KEY hmacKey;
@ -263,37 +500,20 @@ VOID SymCryptModuleDoHmac(
SymCryptHmacSha256Init( &hmacState, &hmacKey );
const Elf64_Ehdr* header = (Elf64_Ehdr*) module_base;
const Elf64_Phdr* programHeaderStart = (Elf64_Phdr*) ( module_base + header->e_phoff );
const Elf_Ehdr* header = (Elf_Ehdr*) module_base;
const Elf_Phdr* programHeaderStart = (Elf_Phdr*) ( module_base + header->e_phoff );
for( const Elf64_Phdr* programHeader = programHeaderStart;
for( const Elf_Phdr* programHeader = programHeaderStart;
programHeader->p_type == PT_LOAD; ++programHeader )
{
// Sometimes the virtual address of a segment is greater than its offset into the module
// file on disk. This means extra NULL bytes will be inserted into the module's memory
// space at runtime. Those bytes are not part of our FIPS boundary, so we skip over them
// and always start reading from the segment's virtual address
Elf64_Addr segmentStart = module_base + (Elf64_Off) programHeader->p_vaddr;
Elf_Addr segmentStart = module_base + (Elf_Off) programHeader->p_vaddr;
if( (programHeader->p_flags & PF_W) == PF_W &&
SYMCRYPT_FORCE_READ64( &SymCryptVolatileFipsBoundaryOffset ) <= programHeader->p_offset + programHeader->p_filesz )
{
// If we are processing the final writable segment (containing the .data section which
// marks the end of our FIPS boundary), then we need to reverse relocations in it
SymCryptModuleProcessSegmentWithRelocations( module_base, programHeader, dynStart,
SymCryptModuleProcessSegmentWithRelocations( module_base, programHeader, dynStart,
relaInfo, &hmacState );
}
else
{
// For AMD64/ARM64, non-writeable segments do not contain relocations, so we can write
// them in their entirety without modification. Note that the size in memory of the
// section may be larger than the size on disk, but again, the additional size in memory
// is not part of our FIPS boundary
// For now we assume that if there are writable segments before the final writable
// segment that they also contain no relocations
SymCryptHmacSha256Append( &hmacState, (PCBYTE) segmentStart,
programHeader->p_filesz );
}
}
SymCryptHmacSha256Result( &hmacState, actualDigest );
@ -307,32 +527,32 @@ VOID SymCryptModuleVerifyIntegrity(void)
{
// Verify that our placeholder values were modified after compile time. The build script
// should have replaced the placeholder values with their expected values
SYMCRYPT_FIPS_ASSERT( SYMCRYPT_FORCE_READ64( &SymCryptVolatileFipsHmacKeyRva ) != PLACEHOLDER_VALUE );
SYMCRYPT_FIPS_ASSERT( SYMCRYPT_FORCE_READ64( &SymCryptVolatileFipsBoundaryOffset ) != PLACEHOLDER_VALUE );
SYMCRYPT_FIPS_ASSERT( SYMCRYPT_FORCE_READ_ADDR( &SymCryptVolatileFipsHmacKeyRva ) != PLACEHOLDER_VALUE );
SYMCRYPT_FIPS_ASSERT( SYMCRYPT_FORCE_READ_ADDR( &SymCryptVolatileFipsBoundaryOffset ) != PLACEHOLDER_VALUE );
const Elf64_Addr module_base = (Elf64_Addr) SymCryptVolatileFipsHmacKey -
SYMCRYPT_FORCE_READ64( &SymCryptVolatileFipsHmacKeyRva );
const Elf_Addr module_base = (Elf_Addr) SymCryptVolatileFipsHmacKey -
SYMCRYPT_FORCE_READ_ADDR( &SymCryptVolatileFipsHmacKeyRva );
const Elf64_Ehdr* header = (Elf64_Ehdr*) module_base;
const Elf_Ehdr* header = (Elf_Ehdr*) module_base;
SYMCRYPT_FIPS_ASSERT( memcmp(header->e_ident.ident.magic, ElfMagic, sizeof(ElfMagic)) == 0 );
SYMCRYPT_FIPS_ASSERT( header->e_type == ET_DYN );
SYMCRYPT_FIPS_ASSERT( header->e_machine == EM_X86_64 || header->e_machine == EM_AARCH64 );
SYMCRYPT_FIPS_ASSERT( header->e_machine == EM_X86_64 || header->e_machine == EM_AARCH64 || header->e_machine == EM_ARM );
SYMCRYPT_FIPS_ASSERT( header->e_version == EV_CURRENT );
SYMCRYPT_FIPS_ASSERT( header->e_ehsize == sizeof(Elf64_Ehdr) );
SYMCRYPT_FIPS_ASSERT( header->e_phentsize == sizeof(Elf64_Phdr) );
SYMCRYPT_FIPS_ASSERT( header->e_ehsize == sizeof(Elf_Ehdr) );
SYMCRYPT_FIPS_ASSERT( header->e_phentsize == sizeof(Elf_Phdr) );
const Elf64_Phdr* programHeaderStart = (Elf64_Phdr*) ( module_base + header->e_phoff );
const Elf_Phdr* programHeaderStart = (Elf_Phdr*) ( module_base + header->e_phoff );
Elf64_Rela_Info relaInfo = {};
Elf_Rela_Info relaInfo = {};
Elf64_Dyn* dynStart = NULL;
Elf_Dyn* dynStart = NULL;
for( unsigned int i = 0; i < header->e_phnum; ++i )
{
const Elf64_Phdr* programHeader = programHeaderStart + i;
const Elf_Phdr* programHeader = programHeaderStart + i;
if( programHeader->p_type == PT_DYNAMIC )
{
dynStart = (Elf64_Dyn*) (module_base + (Elf64_Off) programHeader->p_vaddr);
dynStart = (Elf_Dyn*) (module_base + (Elf_Off) programHeader->p_vaddr);
SymCryptModuleFindRelocationInfo( dynStart, &relaInfo );

View file

@ -1,21 +1,22 @@
//
// integrity.h
// FIPS 140-3 integrity verification header for ELF binaries
//
// Copyright (c) Microsoft Corporation. Licensed under the MIT license.
//
VOID SymCryptModuleVerifyIntegrity(void);
//
// This function verifies the integrity of the loadable segments of the SymCrypt ELF module using
// HMAC-SHA256. The module must have been postprocessed after compilation using
// process_fips_module.py. The integrity check finds the module's base address in memory by
// subtracting the relative virtual address of a known variable from its actual address in memory.
// It then uses the ELF header to find all the loadable segments in the module and calculate the
// HMAC-SHA256 digest of these segments. For writeable segments which are subject to relocations,
// the relocations will be reversed prior to being added to the HMAC, so that the HMAC input will
// match the contents of the file on disk prior to relocation.
//
// If the integrity check fails for any reason, the module will fastfail, crashing the process,
// since a failed integrity check means it cannot operate in compliance with FIPS 140-3.
//
//
// integrity.h
// FIPS 140-3 integrity verification header for ELF binaries
//
// Copyright (c) Microsoft Corporation. Licensed under the MIT license.
//
VOID SymCryptModuleVerifyIntegrity(void);
//
// This function verifies the integrity of the loadable segments of the SymCrypt ELF module using
// HMAC-SHA256. The module must have been postprocessed after compilation using
// process_fips_module.py. The integrity check finds the module's base address in memory by
// subtracting the relative virtual address of a known variable from its actual address in memory.
// It then uses the ELF header to find all the loadable segments in the module and calculate the
// HMAC-SHA256 digest of these segments. For writeable segments which are subject to relocations,
// the relocations will be reversed prior to being added to the HMAC, so that the HMAC input will
// match the contents of the file on disk prior to relocation.
//
// If the integrity check fails for any reason, the module will fastfail, crashing the process,
// since a failed integrity check means it cannot operate in compliance with FIPS 140-3.
//
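A toy worked example of the base-address recovery described above; all numbers are invented:

key_address = 0x7F3A1C005040  # where SymCryptVolatileFipsHmacKey landed at runtime
key_rva     = 0x5040          # RVA patched in by process_fips_module.py
module_base = key_address - key_rva
assert module_base == 0x7F3A1C000000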

View file

@ -3,8 +3,8 @@ set(SOURCES
../common/optional/rngforkdetection.c
../common/optional/rngsecureurandom.c)
# Enable integrity verification if compiling for AMD64 or ARM64
if(SYMCRYPT_TARGET_ARCH MATCHES "AMD64|ARM64")
# Enable integrity verification if compiling for AMD64, ARM64, or ARM
if(SYMCRYPT_TARGET_ARCH MATCHES "AMD64|ARM")
list(APPEND SOURCES ../common/integrity.c)
else()
list(APPEND SOURCES ../common/nointegrity.c)

View file

@ -15,7 +15,7 @@ import subprocess
import sys
from typing import List
ARCH_CMAKE = ("x86", "amd64", "arm64")
ARCH_CMAKE = ("x86", "amd64", "arm64", "arm")
CONFIG_CMAKE = ("Debug", "Release", "Sanitize")
ARCH_MSBUILD = ("x86", "amd64", "arm64")
@ -35,8 +35,8 @@ def get_normalized_host_arch() -> str:
normalized_arch = "amd64"
elif re.fullmatch("ARM64|aarch64", host_arch):
normalized_arch = "arm64"
# No support for ARM32 right now
elif re.fullmatch("ARM32|aarch32", host_arch):
normalized_arch = "arm"
if not normalized_arch:
print("Unrecognized host architecture " + host_arch, file = sys.stderr)
@ -82,7 +82,8 @@ def configure_cmake(args : argparse.Namespace) -> None:
cmake_args.append("x64")
elif args.arch == "arm64":
cmake_args.append("arm64")
elif args.arch == "arm":
cmake_args.append("arm")
# No support for ARM32 right now
if args.host_arch != args.arch:

View file

@ -197,7 +197,7 @@ def main() -> None:
parser = argparse.ArgumentParser(description = "Packaging helper script for SymCrypt.")
parser.add_argument("build_dir", type = pathlib.Path, help = "Build output directory.")
parser.add_argument("arch", type = str.lower, help = "Architecture of the binaries to package (for inclusion in the package name).", choices = ("x86", "amd64", "arm64"))
parser.add_argument("arch", type = str.lower, help = "Architecture of the binaries to package (for inclusion in the package name).", choices = ("x86", "amd64", "arm64", "arm"))
parser.add_argument("config", type = str, help = "Build configuration.", choices = ["Debug", "Release", "Sanitize"])
parser.add_argument("module_name", type = str, help = "Name of the module to package.")
parser.add_argument("release_dir", type = pathlib.Path, help = "Directory to place the release in.")

View file

@ -21,7 +21,7 @@ import struct
from elftools.elf.constants import P_FLAGS
from elftools.elf.elffile import ELFFile
from elftools.elf.enums import ENUM_RELOC_TYPE_x64, ENUM_RELOC_TYPE_AARCH64
from elftools.elf.enums import ENUM_RELOC_TYPE_x64, ENUM_RELOC_TYPE_AARCH64, ENUM_RELOC_TYPE_ARM
# Names of global constants in the FIPS module that need to be replaced
KEY_NAME = "SymCryptVolatileFipsHmacKey"
@ -33,9 +33,27 @@ DIGEST_NAME = "SymCryptVolatileFipsHmacDigest"
CHAR_FORMAT_SPECIFIER = "s"
QWORD_FORMAT_SPECIFIER = "Q"
QWORD_BYTE_SIZE = struct.calcsize(QWORD_FORMAT_SPECIFIER)
DWORD_FORMAT_SPECIFIER = "I"
DWORD_BYTE_SIZE = struct.calcsize(DWORD_FORMAT_SPECIFIER)
RELOCATION_TYPE_SIZES = {
ENUM_RELOC_TYPE_x64["R_X86_64_JUMP_SLOT"]: {
'size': QWORD_BYTE_SIZE,
'format': QWORD_FORMAT_SPECIFIER,
},
ENUM_RELOC_TYPE_AARCH64["R_AARCH64_JUMP_SLOT"]: {
'size': QWORD_BYTE_SIZE,
'format': QWORD_FORMAT_SPECIFIER,
},
ENUM_RELOC_TYPE_ARM["R_ARM_JUMP_SLOT"]: {
'size': DWORD_BYTE_SIZE,
'format': DWORD_FORMAT_SPECIFIER,
},
}
# Must match the placeholder values in integrity.c
PLACEHOLDER_VALUE = struct.pack(QWORD_FORMAT_SPECIFIER, 0x8BADF00D)
PLACEHOLDER_VALUE_64BIT = struct.pack(QWORD_FORMAT_SPECIFIER, 0x4BADF00D8BADF00D)
PLACEHOLDER_VALUE_32BIT = struct.pack(DWORD_FORMAT_SPECIFIER, 0x8BADF00D)
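As a sanity check relating the two placeholders (this holds on a little-endian host, since struct uses native byte order here):

# the low dword of the 64-bit placeholder equals the 32-bit placeholder
assert PLACEHOLDER_VALUE_64BIT[:DWORD_BYTE_SIZE] == PLACEHOLDER_VALUE_32BIT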
PLACEHOLDER_ARRAY = bytes((
0x5B, 0x75, 0xBB, 0xE4, 0x9E, 0x18, 0x03, 0x55,
0x08, 0x4E, 0x3F, 0xE7, 0x60, 0x7E, 0x4F, 0x08,
@ -136,7 +154,8 @@ class ElfFileValueProxy(object):
# The .data() method returns a bytes object. We can't use it to write back to the original
# buffer, so we need to find the appropriate section within the stream using sh_offset and
# write to that.
logging.debug("Writing {} to offset {}".format(value.hex(), hex(self.offset)))
# Note self.section.stream == self.elf_file.stream.
logging.debug("Changing {} writing {} to offset {}".format(self.name, value.hex(), hex(self.offset)))
self.section.stream.seek(self.offset)
self.section.stream.write(value)
@ -145,7 +164,7 @@ class ElfFileValueProxy(object):
assert(len(new_value) == self.length)
if self.name is not None:
logging.debug("Changing {} value".format(self.name))
logging.debug("Changing {} value to {}".format(self.name, *args))
self.value = new_value
@ -156,6 +175,36 @@ def log_value(var):
hex(var.vaddr),
var.value.hex()))
def dbg_dump_hex(data, address=0, file=None):
digits = "0123456789abcdef"
char_per_line = 16
remaining = len(data)
for line_pos in range(0, len(data), char_per_line):
chars_line = char_per_line if remaining > char_per_line else remaining
line = "{:08x} ".format(address)
address += chars_line
for i in range(char_per_line):
if i < chars_line:
line += "{:02x}".format(data[line_pos + i])
if i == 7:
line += ":"
else:
line += " "
else:
line += " "
line += " "
for i in range(chars_line):
if data[line_pos + i] < 32 or data[line_pos + i] > 126:
line += "."
else:
line += chr(data[line_pos + i])
print(line, file=file)
remaining -= char_per_line
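For instance, with the helper above:

dbg_dump_hex(bytes(range(48)), address=0x1000)  # prints three 16-byte lines to stdout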
def hmac_module(loadable_segments, data_section_offset, key, digest, dump_file_path = None):
"""
Performs HMAC-SHA256 on module contents and writes it back to the module buffer
@ -164,6 +213,10 @@ def hmac_module(loadable_segments, data_section_offset, key, digest, dump_file_p
module_bytes = bytearray()
last_segment_offset = -1
dump_file = None
if dump_file_path is not None:
dump_file = open(dump_file_path + '.txt', "w")
for (index, segment) in enumerate(loadable_segments):
segment_hashable_length = 0
@ -175,9 +228,13 @@ def hmac_module(loadable_segments, data_section_offset, key, digest, dump_file_p
if segment["p_offset"] + segment["p_filesz"] > data_section_offset:
segment_hashable_length = data_section_offset - segment["p_offset"]
module_bytes += segment.data()[:segment_hashable_length]
print("\nHMAC append: off {:08x} past data segment size {:x}".format(segment['p_offset'], segment_hashable_length), file=dump_file)
else:
module_bytes += segment.data()
segment_hashable_length = len(segment.data())
print("\nHMAC append: off {:08x} normal size {:x}".format(segment['p_offset'], segment_hashable_length), file=dump_file)
dbg_dump_hex(segment.data()[:segment_hashable_length], file=dump_file)
logging.info("Segment {}: {} - {}".format(
index,
@ -186,7 +243,8 @@ def hmac_module(loadable_segments, data_section_offset, key, digest, dump_file_p
last_segment_offset = segment["p_offset"]
if dump_file_path is not None:
if dump_file is not None:
dump_file.close()
with open(dump_file_path, "wb") as dump_file:
dump_file.write(module_bytes)
@ -265,29 +323,34 @@ def overwrite_jump_slots(elf_file, new_value):
(vaddr, original value), so that they can be reset after the HMAC digest is calculated.
"""
rela_plt_section = elf_file.get_section_by_name(".rela.plt")
# Jump slot relocations always live in the .rela.plt section. If there is no .rela.plt section,
# there will be no jump slot relocations
if rela_plt_section is None:
return []
original_jump_slot_values = []
for relocation in rela_plt_section.iter_relocations():
# Jump slot relocations live in .rela.plt or .rel.plt sections.
rela_plt_section = elf_file.get_section_by_name(".rela.plt")
rel_plt_section = elf_file.get_section_by_name(".rel.plt")
relocations = []
if rela_plt_section is not None:
relocations += rela_plt_section.iter_relocations()
if rel_plt_section is not None:
relocations += rel_plt_section.iter_relocations()
for relocation in relocations:
relocation_type = relocation["r_info_type"]
if (relocation_type == ENUM_RELOC_TYPE_x64["R_X86_64_JUMP_SLOT"]
or relocation_type == ENUM_RELOC_TYPE_AARCH64["R_AARCH64_JUMP_SLOT"]):
logging.debug("Found relocation type {}".format(relocation_type))
if relocation_type in RELOCATION_TYPE_SIZES:
# Note that r_offset is actually a virtual address
relocation_value = ElfFileValueProxy.from_vaddr(elf_file, relocation["r_offset"], QWORD_BYTE_SIZE)
relocation_value = ElfFileValueProxy.from_vaddr(elf_file, relocation["r_offset"],
RELOCATION_TYPE_SIZES[relocation_type]["size"])
original_value_int = struct.unpack(QWORD_FORMAT_SPECIFIER, relocation_value.value)[0]
original_jump_slot_values.append((relocation_value.vaddr, original_value_int))
original_value_int = struct.unpack(RELOCATION_TYPE_SIZES[relocation_type]["format"], relocation_value.value)[0]
original_jump_slot_values.append((relocation_value.vaddr, original_value_int, relocation_type))
logging.debug("Updating relocation at {} with original value {}".format(
hex(relocation_value.offset), hex(original_value_int)))
relocation_value.set_value(QWORD_FORMAT_SPECIFIER, new_value)
relocation_value.set_value(RELOCATION_TYPE_SIZES[relocation_type]["format"], new_value)
else:
logging.warning("Unknown relocation type {} found at offset {}".format(
relocation_type, relocation["r_offset"]))
@ -301,13 +364,14 @@ def reset_jump_slots(elf_file, original_jump_slot_values):
lazy binding still works.
"""
for vaddr, original_value in original_jump_slot_values:
relocation_value = ElfFileValueProxy.from_vaddr(elf_file, vaddr, QWORD_BYTE_SIZE)
for vaddr, original_value, relocation_type in original_jump_slot_values:
relocation_value = ElfFileValueProxy.from_vaddr(elf_file, vaddr,
RELOCATION_TYPE_SIZES[relocation_type]["size"])
logging.debug("Resetting relocation at {} to original value {}".format(
hex(vaddr), hex(original_value)))
relocation_value.set_value(QWORD_FORMAT_SPECIFIER, original_value)
relocation_value.set_value(RELOCATION_TYPE_SIZES[relocation_type]["format"], original_value)
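A hypothetical driver showing how the two helpers above fit together; the file name is assumed:

import io
from elftools.elf.elffile import ELFFile

with open("libsymcrypt.so", "rb") as f:
    elf_file = ELFFile(io.BytesIO(f.read()))
originals = overwrite_jump_slots(elf_file, 0)  # zero the GOT slots before hashing
# ... HMAC the loadable segments here, as hmac_module() does ...
reset_jump_slots(elf_file, originals)          # restore the values so lazy binding works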
def main():
"""
@ -335,7 +399,7 @@ def main():
elf_file = ELFFile(buffer_stream)
arch = elf_file["e_machine"]
if not arch == "EM_X86_64" and not arch == "EM_AARCH64":
if not arch == "EM_X86_64" and not arch == "EM_AARCH64" and not arch == "EM_ARM":
logging.error("Unsupported architecture {}".format(arch))
raise RuntimeError
@ -354,20 +418,32 @@ def main():
# Find the HMAC key relative virtual address placeholder and replace it with
# the actual relative virtual address of the HMAC key
key_rva_variable = ElfFileValueProxy.from_symbol_name(elf_file, KEY_RVA_NAME)
assert(key_rva_variable.value == PLACEHOLDER_VALUE)
if arch == "EM_ARM":
assert(key_rva_variable.value == PLACEHOLDER_VALUE_32BIT)
else:
assert(key_rva_variable.value == PLACEHOLDER_VALUE_64BIT)
key_rva_variable.set_value(QWORD_FORMAT_SPECIFIER, key_variable.vaddr)
if arch == "EM_ARM":
key_rva_variable.set_value(DWORD_FORMAT_SPECIFIER, key_variable.vaddr)
else:
key_rva_variable.set_value(QWORD_FORMAT_SPECIFIER, key_variable.vaddr)
log_value(key_rva_variable)
# Find the FIPS module boundary placeholder and replace it with the our actual
# FIPS module boundary, which we have defined to be the start of the .data section
fips_boundary_variable = ElfFileValueProxy.from_symbol_name(elf_file, BOUNDARY_OFFSET_NAME)
assert(fips_boundary_variable.value == PLACEHOLDER_VALUE)
if arch == "EM_ARM":
assert(fips_boundary_variable.value == PLACEHOLDER_VALUE_32BIT)
else:
assert(fips_boundary_variable.value == PLACEHOLDER_VALUE_64BIT)
data_section = elf_file.get_section_by_name(".data")
data_section_offset = data_section["sh_offset"]
fips_boundary_variable.set_value(QWORD_FORMAT_SPECIFIER, data_section_offset)
if arch == "EM_ARM":
fips_boundary_variable.set_value(DWORD_FORMAT_SPECIFIER, data_section_offset)
else:
fips_boundary_variable.set_value(QWORD_FORMAT_SPECIFIER, data_section_offset)
log_value(fips_boundary_variable)
# Find the HMAC digest placeholder, HMAC the loadable segments of the module, and replace

View file

@ -20,9 +20,9 @@ appropriate to enable this effort.
Normally the processing of symcryptasm files takes place in 2 passes. The first pass is performed by
this symcryptasm_processor.py script, which does the more stateful processing, outputting a .cppasm
file. If the processed symcryptasm file includes other files via the INCLUDE directive, the contents
of the included files are merged at their point of inclusion to generate a single expanded symcryptasm
file which is saved with a .symcryptasmexp extension to the output folder. For symcryptasm files which
do not include other files, there's no corresponding .symcryptasmexp file as it would be identical to
of the included files are merged at their point of inclusion to generate a single expanded symcryptasm
file which is saved with a .symcryptasmexp extension to the output folder. For symcryptasm files which
do not include other files, there's no corresponding .symcryptasmexp file as it would be identical to
the source file.
The .cppasm files are further processed by the C preprocessor to do more simple stateless text
@ -78,7 +78,7 @@ A leaf function ends with the FUNCTION_END macro, which also takes the function
At the function start a prologue is generated which arranges the arguments appropriately in
registers, and saves non-volatile registers that have been requested to be used.
At the function end an epilogue is generated with restores the non-volatile registers and returns.
At the function end an epilogue is generated which restores the non-volatile registers and returns.
A nested function (a function which does call another function) is specified similarly, only using
@ -112,14 +112,14 @@ prologue. The MUL_FUNCTION_START and MUL_FUNCTION_END macros are used in this ca
We currently do not support nested mul functions, as we have none of them.
Stack layout for amd64 is as follows. Xmm registers are volatile and not saved on Linux.
Memory Exists if
|-------------------|
| |
| Shadow space |
| |
| Shadow space |
| |
|-------------------|
| Return address |
| Return address |
|-------------------|
| Non-volatile |
| general purpose | reg_count > volatile_registers
@ -149,6 +149,11 @@ currently support, these registers are volatile so do not need any special handl
X_0 is always the result register and the first argument passed to the function.
X_1-X_7 are the arguments 2-8 passed to the function
### arm (32) ###
We allow the registers r0-r12 to be addressed, as r13-r15 are special registers that we cannot use as general-purpose registers.
As r12 is volatile in a leaf function, it should be used in preference to r4, to avoid spilling/restoring a register.
"""
import re
@ -216,6 +221,24 @@ ARM64_R28 = Register("x28", "w28")
ARM64_R29 = Register("x29", "w29") # Frame Pointer
ARM64_R30 = Register("x30", "w30") # Link Register
# arm32 registers
ARM32_R0 = Register(None, "r0")
ARM32_R1 = Register(None, "r1")
ARM32_R2 = Register(None, "r2")
ARM32_R3 = Register(None, "r3")
ARM32_R4 = Register(None, "r4")
ARM32_R5 = Register(None, "r5")
ARM32_R6 = Register(None, "r6")
ARM32_R7 = Register(None, "r7")
ARM32_R8 = Register(None, "r8")
ARM32_R9 = Register(None, "r9")
ARM32_R10 = Register(None, "r10")
ARM32_R11 = Register(None, "r11")
ARM32_R12 = Register(None, "r12")
ARM32_R13 = Register(None, "r13")
ARM32_R14 = Register(None, "r14")
ARM32_R15 = Register(None, "r15")
class CallingConvention:
"""A class to represent calling conventions"""
@ -306,7 +329,7 @@ def calc_amd64_stack_allocation_sizes(self, reg_count, stack_alloc_size, xmm_reg
aligned_on_16B = True
# Calculate the space needed to save Xmm registers on the stack
saved_reg_xmm = 0 if xmm_reg_count <= 6 else (xmm_reg_count - 6)
saved_reg_xmm = 0 if xmm_reg_count <= 6 else (xmm_reg_count - 6)
xmm_save_size = 16 * saved_reg_xmm
if xmm_save_size > 0 and not aligned_on_16B:
xmm_save_size += 8
@ -337,7 +360,7 @@ def gen_prologue_amd64_msft(self, arg_count, reg_count, stack_alloc_size, xmm_re
prologue = "\n"
# Calculate the sizes of the buffers needed for saving registers, local variable buffer and shadow space.
# Calculate the sizes of the buffers needed for saving registers, local variable buffer and shadow space.
# Each of the sections other than general purpose registers are aligned on 16B boundary and some of them
# may include an 8B padding in their size.
reg_save_size, xmm_save_size, stack_alloc_aligned_size, shadow_space_allocation_size = calc_amd64_stack_allocation_sizes(
@ -380,12 +403,12 @@ def gen_prologue_amd64_msft_nested(self, arg_count, reg_count, stack_alloc_size,
return gen_prologue_amd64_msft(self, arg_count, reg_count, stack_alloc_size, xmm_reg_count, mul_fixup = "", nested = True)
def gen_epilogue_amd64_msft(self, arg_count, reg_count, stack_alloc_size, xmm_reg_count, nested = False):
epilogue = "\n"
reg_save_size, xmm_save_size, stack_alloc_aligned_size, shadow_space_allocation_size = calc_amd64_stack_allocation_sizes(
self, reg_count, stack_alloc_size, xmm_reg_count, nested)
# Restore non-volatile Xmm registers
if xmm_save_size > 0:
for i in range(6, xmm_reg_count):
@ -454,7 +477,7 @@ MAPPING_AMD64_SYSTEMV = {
}
def gen_prologue_amd64_systemv(self, arg_count, reg_count, stack_alloc_size, xmm_reg_count, mul_fixup = "", nested = False):
# Calculate the sizes required for each section
# We need to call with xmm_reg_count=0 to avoid allocation/alignment for saving Xmm registers since they're
# volatile for this calling convention.
@ -491,7 +514,7 @@ def gen_prologue_amd64_systemv_nested(self, arg_count, reg_count, stack_alloc_si
return gen_prologue_amd64_systemv(self, arg_count, reg_count, stack_alloc_size, xmm_reg_count, mul_fixup = "", nested = True)
def gen_epilogue_amd64_systemv(self, arg_count, reg_count, stack_alloc_size, xmm_reg_count, nested = False):
epilogue = ""
# Calculate the sizes required for each section
@ -601,6 +624,27 @@ MAPPING_ARM64_ARM64ECMSFT = {
# R28 is reserved in ARM64EC
}
# ARM32 calling convention
# A subroutine must preserve the contents of the registers r4-r8, r10, r11 and SP (and r9 in PCS variants that designate r9 as v6).
MAPPING_ARM32_AAPCS32 = {
0: ARM32_R0, # Argument 1 / Result register / volatile
1: ARM32_R1, # Argument 2 / Result register / volatile
2: ARM32_R2, # Argument 3 / volatile
3: ARM32_R3, # Argument 4 / volatile
4: ARM32_R4, # non-volatile
5: ARM32_R5, # non-volatile
6: ARM32_R6, # non-volatile
7: ARM32_R7, # non-volatile
8: ARM32_R8, # non-volatile
9: ARM32_R9, # platform register (v6 in some PCS variants); treated as reserved
10:ARM32_R10, # non-volatile
11:ARM32_R11, # FP non-volatile
12:ARM32_R12, # volatile for leaf functions
13:ARM32_R13, # SP
14:ARM32_R14, # LR
15:ARM32_R15, # PC
}
def gen_prologue_aapcs64(self, arg_count, reg_count, stack_alloc_size, xmm_reg_count):
prologue = ""
@ -621,6 +665,38 @@ def gen_epilogue_aapcs64(self, arg_count, reg_count, stack_alloc_size, xmm_reg_c
return epilogue
def gen_prologue_aapcs32(self, arg_count, reg_count, stack_alloc_size, xmm_reg_count):
assert(not stack_alloc_size and not xmm_reg_count)
prologue = ""
# Always spill at least 1 register (LR).
# LR needs to be saved for nested functions but for now we'll store it always
# since we don't differentiate between nested and leaf functions for arm yet.
registers_to_spill = []
logging.info(f"prologue {reg_count} > {self.volatile_registers}")
if reg_count > self.volatile_registers:
for i in range(self.volatile_registers, reg_count):
registers_to_spill.append('r%s' % i)
# Stack pointer is word 4B aligned
# required_stack_space = 4 * len(registers_to_spill)
registers_to_spill.append('lr')
prologue += "push {" + ",".join(registers_to_spill) + "}\n"
return prologue
def gen_epilogue_aapcs32(self, arg_count, reg_count, stack_alloc_size, xmm_reg_count):
assert(not stack_alloc_size and not xmm_reg_count)
epilogue = ""
registers_to_spill = []
logging.info(f"epilogue {reg_count} > {self.volatile_registers}")
if reg_count > self.volatile_registers:
for i in range(self.volatile_registers, reg_count):
registers_to_spill.append('r%s' % i)
# Stack pointer is word 4B aligned
# required_stack_space = 4 * len(registers_to_spill)
registers_to_spill.append('pc')
epilogue += "pop {" + ",".join(registers_to_spill) + "}\n"
return epilogue
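A standalone sketch of the spill-list logic above, assuming volatile_registers == 4 as in CALLING_CONVENTION_ARM32_AAPCS32 below (the function name is hypothetical):

def aapcs32_spill_lists(reg_count, volatile_registers=4):
    saved = ["r%d" % i for i in range(volatile_registers, reg_count)]
    prologue = "push {" + ",".join(saved + ["lr"]) + "}"  # LR is always saved
    epilogue = "pop {" + ",".join(saved + ["pc"]) + "}"   # popping into PC returns
    return prologue, epilogue

# aapcs32_spill_lists(8) == ("push {r4,r5,r6,r7,lr}", "pop {r4,r5,r6,r7,pc}")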
def gen_prologue_arm64ec(self, arg_count, reg_count, stack_alloc_size, xmm_reg_count):
prologue = ""
@ -677,6 +753,10 @@ CALLING_CONVENTION_ARM64EC_MSFT = CallingConvention(
"arm64ec_msft", "arm64", MAPPING_ARM64_ARM64ECMSFT, 8, 8, 16,
gen_prologue_arm64ec, gen_epilogue_arm64ec, gen_get_memslot_offset_arm64)
CALLING_CONVENTION_ARM32_AAPCS32 = CallingConvention(
"arm32_aapcs32", "arm32", MAPPING_ARM32_AAPCS32, 4, 4, 4,
gen_prologue_aapcs32, gen_epilogue_aapcs32, gen_get_memslot_offset_arm64)
def gen_function_defines(architecture, mapping, arg_count, reg_count, start=True):
defines = ""
if architecture == "amd64":
@ -687,6 +767,8 @@ def gen_function_defines(architecture, mapping, arg_count, reg_count, start=True
elif architecture == "arm64":
prefix64 = "X_"
prefix32 = "W_"
elif architecture == "arm32":
return defines
else:
logging.error("Unhandled architecture (%s) in gen_function_defines" % architecture)
exit(1)
@ -736,8 +818,8 @@ MASM_FUNCTION_TEMPLATE = "%s, _TEXT\n"
# ARMASM64 function macros must be correctly indented
ARMASM64_FUNCTION_TEMPLATE = " %s\n"
GAS_FUNCTION_ENTRY = "%s: .global %s\n"
GAS_FUNCTION_END = ""
GAS_FUNCTION_ENTRY = "%s: .global %s\n.type %s, %%function\n// .func %s\n"
GAS_FUNCTION_END = "// .endfunc // %s"
def generate_prologue(assembler, calling_convention, function_name, arg_count, reg_count, stack_alloc_size, xmm_reg_count, nested):
function_entry = None
@ -756,7 +838,7 @@ def generate_prologue(assembler, calling_convention, function_name, arg_count, r
elif assembler == "armasm64":
function_entry = ARMASM64_FUNCTION_TEMPLATE % function_entry
elif assembler == "gas":
function_entry = GAS_FUNCTION_ENTRY % (function_name, function_name)
function_entry = GAS_FUNCTION_ENTRY % (function_name, function_name, function_name, function_name)
else:
logging.error("Unhandled assembler (%s) in generate_prologue" % assembler)
exit(1)
@ -784,7 +866,7 @@ def generate_epilogue(assembler, calling_convention, function_name, arg_count, r
elif assembler == "armasm64":
function_end = ARMASM64_FUNCTION_TEMPLATE % function_end
elif assembler == "gas":
function_end = GAS_FUNCTION_END
function_end = GAS_FUNCTION_END % function_name
else:
logging.error("Unhandled assembler (%s) in generate_epilogue" % assembler)
exit(1)
@ -928,11 +1010,11 @@ class ProcessingStateMachine:
else:
self.calling_convention = self.normal_calling_convention
return generate_prologue(self.assembler,
self.calling_convention,
self.function_name,
self.arg_count,
self.reg_count,
return generate_prologue(self.assembler,
self.calling_convention,
self.function_name,
self.arg_count,
self.reg_count,
self.stack_alloc_size,
self.xmm_reg_count,
self.is_nested_function
@ -1115,6 +1197,10 @@ def process_file(assembler, architecture, calling_convention, infilename, outfil
normal_calling_convention = CALLING_CONVENTION_ARM64_AAPCS64
mul_calling_convention = None
nested_calling_convention = None
elif architecture == "arm" and calling_convention == "aapcs32":
normal_calling_convention = CALLING_CONVENTION_ARM32_AAPCS32
mul_calling_convention = None
nested_calling_convention = None
elif assembler == "armasm64":
if architecture == "arm64" and calling_convention == "aapcs64":
normal_calling_convention = CALLING_CONVENTION_ARM64_AAPCS64
@ -1149,7 +1235,7 @@ def process_file(assembler, architecture, calling_convention, infilename, outfil
# expand_files() is called recursively when a .symcryptasm file contains an INCLUDE directive,
# except for the first call here where we're starting to process the input source file
# as if it was included by some other file.
# as if it was included by some other file.
expanded_file, infile_has_includes = expand_files(infilename, 0, "")
expanded_lines.extend(expanded_file)
@ -1163,17 +1249,20 @@ def process_file(assembler, architecture, calling_convention, infilename, outfil
with open(outfilename, "w") as outfile:
for line_num, line in enumerate(expanded_lines):
processed_line = file_processing_state.process_line(line, line_num)
# logging.info("processed line: %s" % processed_line)
outfile.write(processed_line)
if __name__ == "__main__":
import argparse
# logging.basicConfig(level=logging.INFO)
parser = argparse.ArgumentParser(description="Preprocess symcryptasm into files that will be further processed with C preprocessor to generate MASM or GAS")
parser.add_argument('assembler', type=str, help='Assembler that we want to preprocess for', choices=['masm', 'gas', 'armasm64'])
parser.add_argument('architecture', type=str, help='Architecture that we want to preprocess for', choices=['amd64', 'arm64'])
parser.add_argument('calling_convention', type=str, help='Calling convention that we want to preprocess for', choices=['msft', 'systemv', 'aapcs64', 'arm64ec'])
parser.add_argument('architecture', type=str, help='Architecture that we want to preprocess for', choices=['amd64', 'arm64', 'arm'])
parser.add_argument('calling_convention', type=str, help='Calling convention that we want to preprocess for', choices=['msft', 'systemv', 'aapcs64', 'arm64ec', 'aapcs32'])
parser.add_argument('inputfile', type=str, help='Path to input file')
parser.add_argument('outputfile', type=str, help='Path to output file')
args = parser.parse_args()
process_file(args.assembler, args.architecture, args.calling_convention, args.inputfile, args.outputfile)
logging.info("Done")

View file

@ -18,6 +18,6 @@ target_link_libraries(symcryptunittest symcryptunittest_lib symcrypt_common atom
# Export oe_sgx_get_additional_host_entropy from the executable so that if we dynamically load
# oe module, the linker can find the version which is locally defined in the executable
target_link_options(symcryptunittest PRIVATE
target_link_options(symcryptunittest PRIVATE
-Wl,--dynamic-list=${CMAKE_CURRENT_SOURCE_DIR}/dynamic-list.ver
)

View file

@ -97,5 +97,11 @@ else()
add_compile_options(-DINCLUDE_IMPL_RSA32=0)
endif()
if(SYMCRYPT_TARGET_ARCH STREQUAL "ARM" AND CMAKE_C_COMPILER_ID MATCHES "GNU")
# Hide warnings due to an ABI change.
set_source_files_properties(kat.cpp PROPERTIES COMPILE_OPTIONS "-Wno-psabi")
set_source_files_properties(perf.cpp PROPERTIES COMPILE_OPTIONS "-Wno-psabi")
endif()
add_library(symcryptunittest_lib STATIC ${SOURCES})
set_target_properties(symcryptunittest_lib PROPERTIES PREFIX "")