зеркало из https://github.com/microsoft/SymCrypt.git
Merged PR 9576163: Build SymCrypt with gcc-arm-linux-gnueabihf
This commit is contained in:
Родитель
17360b237b
Коммит
b6a267815e
|
@ -180,4 +180,18 @@ extends:
|
|||
config: 'Release'
|
||||
cc: 'clang'
|
||||
cxx: 'clang++'
|
||||
additionalArgs: '--toolchain=cmake-configs/Toolchain-Clang-ARM64.cmake'
|
||||
additionalArgs: '--toolchain=cmake-configs/Toolchain-Clang-ARM64.cmake'
|
||||
- template: .pipelines/templates/build-linux.yml@self
|
||||
parameters:
|
||||
arch: 'ARM'
|
||||
config: 'Debug'
|
||||
cc: 'gcc'
|
||||
cxx: 'g++'
|
||||
additionalArgs: '--toolchain=cmake-configs/Toolchain-GCC-ARM.cmake'
|
||||
- template: .pipelines/templates/build-linux.yml@self
|
||||
parameters:
|
||||
arch: 'ARM'
|
||||
config: 'Release'
|
||||
cc: 'gcc'
|
||||
cxx: 'g++'
|
||||
additionalArgs: '--toolchain=cmake-configs/Toolchain-GCC-ARM.cmake'
|
|
@ -74,6 +74,12 @@ jobs:
|
|||
apt-get install -y binutils-aarch64-linux-gnu gcc-aarch64-linux-gnu g++-aarch64-linux-gnu qemu-user
|
||||
displayName: 'Install arm64 cross-compilation tools'
|
||||
|
||||
- ${{ if eq(parameters.arch, 'ARM') }}:
|
||||
- script: |
|
||||
apt-get update
|
||||
apt-get install -y gcc-arm-linux-gnueabihf g++-arm-linux-gnueabihf qemu-user
|
||||
displayName: 'Install arm64 cross-compilation tools'
|
||||
|
||||
- task: PipAuthenticate@1
|
||||
inputs:
|
||||
artifactFeeds: 'OS/SymCrypt_PublicPackages'
|
||||
|
@ -92,23 +98,24 @@ jobs:
|
|||
|
||||
- ${{ if ne(parameters.skipTests, true) }}:
|
||||
- ${{ if ne(parameters.arch, 'ARM64') }}:
|
||||
- task: PythonScript@0
|
||||
displayName: 'Run unit tests'
|
||||
inputs:
|
||||
scriptSource: 'filePath'
|
||||
scriptPath: scripts/test.py
|
||||
arguments: 'bin noperftests'
|
||||
workingDirectory: $(Build.SourcesDirectory)
|
||||
|
||||
- ${{ if ne(parameters.config, 'Sanitize') }}:
|
||||
- ${{ if ne(parameters.arch, 'ARM') }}:
|
||||
- task: PythonScript@0
|
||||
displayName: 'Run dynamic unit tests'
|
||||
displayName: 'Run unit tests'
|
||||
inputs:
|
||||
scriptSource: 'filePath'
|
||||
scriptPath: scripts/test.py
|
||||
arguments: 'bin dynamic:bin/module/generic/libsymcrypt.so noperftests'
|
||||
arguments: 'bin noperftests'
|
||||
workingDirectory: $(Build.SourcesDirectory)
|
||||
|
||||
- ${{ if ne(parameters.config, 'Sanitize') }}:
|
||||
- task: PythonScript@0
|
||||
displayName: 'Run dynamic unit tests'
|
||||
inputs:
|
||||
scriptSource: 'filePath'
|
||||
scriptPath: scripts/test.py
|
||||
arguments: 'bin dynamic:bin/module/generic/libsymcrypt.so noperftests'
|
||||
workingDirectory: $(Build.SourcesDirectory)
|
||||
|
||||
- ${{ if eq(parameters.arch, 'AMD64') }}:
|
||||
- task: PythonScript@0
|
||||
displayName: 'Run unit tests (test YMM save/restore)'
|
||||
|
@ -134,6 +141,23 @@ jobs:
|
|||
scriptPath: scripts/test.py
|
||||
arguments: '--emulator qemu-aarch64 --emulator-lib-dir /usr/aarch64-linux-gnu/ bin dynamic:bin/module/generic/libsymcrypt.so noperftests +symcrypt -dh -dsa -rsa'
|
||||
workingDirectory: $(Build.SourcesDirectory)
|
||||
|
||||
- ${{ if eq(parameters.arch, 'ARM') }}:
|
||||
- task: PythonScript@0
|
||||
displayName: 'Run unit tests'
|
||||
inputs:
|
||||
scriptSource: 'filePath'
|
||||
scriptPath: scripts/test.py
|
||||
arguments: '--emulator qemu-arm --emulator-lib-dir /usr/arm-linux-gnueabihf/ bin noperftests +symcrypt -dh -dsa -rsa'
|
||||
workingDirectory: $(Build.SourcesDirectory)
|
||||
|
||||
- task: PythonScript@0
|
||||
displayName: 'Run dynamic unit tests'
|
||||
inputs:
|
||||
scriptSource: 'filePath'
|
||||
scriptPath: scripts/test.py
|
||||
arguments: '--emulator qemu-arm --emulator-lib-dir /usr/arm-linux-gnueabihf/ bin dynamic:bin/module/generic/libsymcrypt.so noperftests +symcrypt -dh -dsa -rsa'
|
||||
workingDirectory: $(Build.SourcesDirectory)
|
||||
|
||||
- task: PythonScript@0
|
||||
displayName: 'Package build output'
|
||||
|
|
23
BUILD.md
23
BUILD.md
|
@ -79,4 +79,25 @@ Building the SymCrypt unit tests with MSBuild requires access to the msbignum an
|
|||
be released externally due to licensing restrictions. If you wish to build directly with MSBuild, bypassing the Python
|
||||
helper script, you can run `msbuild /p:Platform=<platform> /p:Architecture=<arch> symcrypt.sln`. Note that Python is
|
||||
still required for translating SymCryptAsm. The output directory for MSBuild is always `build\bin`, and all compiled
|
||||
outputs are placed in this directory.
|
||||
outputs are placed in this directory.
|
||||
|
||||
## Building Linux targets
|
||||
|
||||
Requires the following packages on debian-based systems to build:
|
||||
```
|
||||
apt-get -y install --no-install-recommends \
|
||||
cmake \
|
||||
python3-pyelftools \ # integrity
|
||||
gcc-arm-linux-gnueabihf g++-arm-linux-gnueabihf # for arm
|
||||
```
|
||||
|
||||
And for running the test:
|
||||
```
|
||||
apt-get -y install --no-install-recommends qemu-user
|
||||
```
|
||||
|
||||
To build and test for example for arm:
|
||||
```
|
||||
python3 scripts/build.py cmake --arch arm --toolchain cmake-configs/Toolchain-GCC-ARM.cmake bin_arm
|
||||
qemu-arm -L /usr/arm-linux-gnueabihf/ ./bin_arm/exe/symcryptunittest -rsa -dsa -dh -ec -int -mod dynamic:bin_arm/module/generic/libsymcrypt.so
|
||||
```
|
|
@ -52,6 +52,8 @@ if(CMAKE_SYSTEM_NAME MATCHES "Linux")
|
|||
# GCC complains about implicit casting between ASIMD registers (i.e. uint8x16_t -> uint64x2_t) by default,
|
||||
# whereas clang and MSVC do not. Setting -flax-vector-conversions to build Arm64 intrinsics code with GCC.
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -flax-vector-conversions")
|
||||
elseif(SYMCRYPT_TARGET_ARCH MATCHES "ARM")
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=armv7-a+neon-vfpv4 -flax-vector-conversions -mfpu=neon")
|
||||
endif()
|
||||
|
||||
# add_compile_options(-Wall)
|
||||
|
|
|
@ -0,0 +1,22 @@
|
|||
# This toolchain file configures ARM cross-compilation with GCC.
|
||||
# To use the toolchain file, run cmake .. --toolchain="../cmake-configs/Toolchain-GCC-ARM.cmake"
|
||||
# Note: the --toolchain argument is only available in CMake v3.21 and newer. Prior to that version, you will have to
|
||||
# specify -DCMAKE_TOOLCHAIN_FILE instead, which may cause a spurious warning about an unused variable.
|
||||
|
||||
set(CMAKE_SYSTEM_PROCESSOR ARM)
|
||||
set(TARGET_TRIPLE arm-linux-gnueabihf)
|
||||
|
||||
set(CMAKE_ASM_COMPILER_TARGET ${TARGET_TRIPLE})
|
||||
set(CMAKE_C_COMPILER arm-linux-gnueabihf-gcc)
|
||||
|
||||
set(CMAKE_C_COMPILER_TARGET ${TARGET_TRIPLE})
|
||||
set(CMAKE_CXX_COMPILER arm-linux-gnueabihf-g++)
|
||||
set(CMAKE_CXX_COMPILER_TARGET ${TARGET_TRIPLE})
|
||||
|
||||
if(NOT CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "ARM|arm")
|
||||
# C/C++ toolchain (installed on Ubuntu using apt-get gcc-arm-linux-gnueabihf g++-arm-linux-gnueabihf)
|
||||
set(CMAKE_SYSROOT_COMPILE /usr/${TARGET_TRIPLE})
|
||||
|
||||
find_path(CXX_CROSS_INCLUDE_DIR NAMES ${TARGET_TRIPLE} PATHS /usr/${TARGET_TRIPLE}/include/ /usr/${TARGET_TRIPLE}/include/c++/ PATH_SUFFIXES 15 14 13 12 11 10 9 8 7 6 5 NO_DEFAULT_PATH)
|
||||
add_compile_options(-I${CXX_CROSS_INCLUDE_DIR}/${TARGET_TRIPLE})
|
||||
endif()
|
|
@ -1568,7 +1568,12 @@ typedef union _SYMCRYPT_GCM_SUPPORTED_BLOCKCIPHER_KEYS
|
|||
|
||||
|
||||
#if SYMCRYPT_CPU_ARM
|
||||
#include <arm_neon.h>
|
||||
#include <arm_neon.h>
|
||||
#if SYMCRYPT_GNUC
|
||||
#define __n128 uint32x4_t
|
||||
#define __n64 uint64x1_t
|
||||
#endif
|
||||
|
||||
#elif SYMCRYPT_CPU_ARM64
|
||||
|
||||
#if SYMCRYPT_MS_VC
|
||||
|
|
|
@ -125,10 +125,12 @@ function(process_cppasm filepath outformat archdefine)
|
|||
if((NOT outformat STREQUAL gas) AND (NOT outformat STREQUAL masm))
|
||||
message(FATAL_ERROR "cppasm processing invoked with unrecognized outformat (${outformat})")
|
||||
endif()
|
||||
if((NOT archdefine STREQUAL amd64) AND (NOT archdefine STREQUAL x86) AND (NOT archdefine STREQUAL arm64))
|
||||
if((NOT archdefine STREQUAL amd64) AND (NOT archdefine STREQUAL x86) AND (NOT archdefine STREQUAL arm64) AND (NOT archdefine STREQUAL arm))
|
||||
message(FATAL_ERROR "cppasm processing invoked with unrecognized archdefine (${archdefine})")
|
||||
endif()
|
||||
|
||||
get_filename_component(rootpath ${filepath} DIRECTORY)
|
||||
|
||||
get_filename_component(filestem ${filepath} NAME_WE) # filestem is the filename w/out extension
|
||||
string(TOUPPER ${outformat} outformatupper)
|
||||
string(TOUPPER ${archdefine} archdefineupper)
|
||||
|
@ -179,10 +181,10 @@ function(process_symcryptasm filepath outformat archdefine callingconvention)
|
|||
if((NOT outformat STREQUAL gas) AND (NOT outformat STREQUAL masm) AND (NOT outformat STREQUAL armasm64))
|
||||
message(FATAL_ERROR "symcryptasm processing invoked with unrecognized outformat (${outformat})")
|
||||
endif()
|
||||
if((NOT archdefine STREQUAL amd64) AND (NOT archdefine STREQUAL x86) AND (NOT archdefine STREQUAL arm64))
|
||||
if((NOT archdefine STREQUAL amd64) AND (NOT archdefine STREQUAL x86) AND (NOT archdefine STREQUAL arm64) AND (NOT archdefine STREQUAL arm))
|
||||
message(FATAL_ERROR "symcryptasm processing invoked with unrecognized archdefine (${archdefine})")
|
||||
endif()
|
||||
if((NOT callingconvention STREQUAL msft) AND (NOT callingconvention STREQUAL systemv) AND (NOT callingconvention STREQUAL aapcs64) AND (NOT callingconvention STREQUAL arm64ec))
|
||||
if((NOT callingconvention STREQUAL msft) AND (NOT callingconvention STREQUAL systemv) AND (NOT callingconvention STREQUAL aapcs64) AND (NOT callingconvention STREQUAL arm64ec) AND (NOT callingconvention STREQUAL aapcs32))
|
||||
message(FATAL_ERROR "symcryptasm processing invoked with unrecognized callingconvention (${callingconvention})")
|
||||
endif()
|
||||
get_filename_component(rootpath ${filepath} DIRECTORY)
|
||||
|
@ -229,8 +231,8 @@ if(WIN32 AND SYMCRYPT_USE_ASM)
|
|||
amd64/fdef369_asm-masm.asm
|
||||
amd64/fdef_mulx-masm.asm
|
||||
amd64/wipe-masm.asm
|
||||
amd64/sha256xmm_asm-masm.asm
|
||||
amd64/sha256ymm_asm-masm.asm
|
||||
amd64/sha256xmm_asm-masm.asm
|
||||
amd64/sha256ymm_asm-masm.asm
|
||||
amd64/sha512ymm_asm-masm.asm
|
||||
amd64/sha512ymm_avx512vl_asm-masm.asm)
|
||||
set_source_files_properties(
|
||||
|
@ -239,8 +241,8 @@ if(WIN32 AND SYMCRYPT_USE_ASM)
|
|||
amd64/fdef369_asm-masm.asm
|
||||
amd64/fdef_mulx-masm.asm
|
||||
amd64/wipe-masm.asm
|
||||
amd64/sha256xmm_asm-masm.asm
|
||||
amd64/sha256ymm_asm-masm.asm
|
||||
amd64/sha256xmm_asm-masm.asm
|
||||
amd64/sha256ymm_asm-masm.asm
|
||||
amd64/sha512ymm_asm-masm.asm
|
||||
amd64/sha512ymm_avx512vl_asm-masm.asm
|
||||
PROPERTY LANGUAGE ASM_MASM)
|
||||
|
@ -294,7 +296,7 @@ elseif(SYMCRYPT_USE_ASM) # Linux
|
|||
amd64/sha512ymm_asm-gas.asm
|
||||
amd64/sha512ymm_avx512vl_asm-gas.asm
|
||||
PROPERTY LANGUAGE ASM)
|
||||
elseif(SYMCRYPT_TARGET_ARCH MATCHES "ARM64")
|
||||
elseif(SYMCRYPT_TARGET_ARCH STREQUAL "ARM64")
|
||||
process_symcryptasm(arm64/fdef_asm.symcryptasm gas arm64 aapcs64)
|
||||
process_symcryptasm(arm64/fdef369_asm.symcryptasm gas arm64 aapcs64)
|
||||
process_symcryptasm(arm64/wipe.symcryptasm gas arm64 aapcs64)
|
||||
|
@ -308,6 +310,22 @@ elseif(SYMCRYPT_USE_ASM) # Linux
|
|||
arm64/fdef369_asm-gas.asm
|
||||
arm64/wipe-gas.asm
|
||||
PROPERTY LANGUAGE ASM)
|
||||
elseif(SYMCRYPT_TARGET_ARCH STREQUAL "ARM")
|
||||
process_symcryptasm(arm/aesasm.symcryptasm gas arm aapcs32)
|
||||
process_symcryptasm(arm/fdef_asm.symcryptasm gas arm aapcs32)
|
||||
process_symcryptasm(arm/wipe.symcryptasm gas arm aapcs32)
|
||||
|
||||
list(APPEND SOURCES_COMMON
|
||||
arm/aesasm-gas.asm
|
||||
arm/fdef_asm-gas.asm
|
||||
arm/wipe-gas.asm)
|
||||
set_source_files_properties(
|
||||
arm/aesasm-gas.asm
|
||||
arm/fdef_asm-gas.asm
|
||||
arm/wipe-gas.asm
|
||||
PROPERTY LANGUAGE ASM)
|
||||
set_source_files_properties(
|
||||
arm/fdef_asm-gas.asm PROPERTIES INCLUDE_DIRECTORIES ${CMAKE_CURRENT_SOURCE_DIR}/arm)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
|
|
|
@ -0,0 +1,977 @@
|
|||
//
|
||||
// AesAsm.cppasm Assembler code for fast AES on ARM
|
||||
//
|
||||
// Copyright (c) Microsoft Corporation. Licensed under the MIT license.
|
||||
//
|
||||
// This code is derived from the AMD64 version of the AesFast
|
||||
// implementation, developed by Niels Ferguson. For questions
|
||||
// about the ARM specifics, contact Aaron Giles.
|
||||
//
|
||||
|
||||
// #include "kxarm.h"
|
||||
|
||||
#include "symcryptasm_shared.cppasm"
|
||||
|
||||
#if SYMCRYPT_DEBUG
|
||||
SET(SYMCRYPT_CODE_VERSION, ((SYMCRYPT_CODE_VERSION_API * 65536) + SYMCRYPT_CODE_VERSION_MINOR))
|
||||
SET(SYMCRYPT_MAGIC_CONSTANT, HEX(53316D76) + SYMCRYPT_CODE_VERSION) // 0x53316D76 == 'S1mv'
|
||||
|
||||
MACRO_START(SYMCRYPT_CHECK_MAGIC, temp1, temp2, ptr, offset, check_magic_label)
|
||||
ldr temp1, [ptr, #offset]
|
||||
subs temp1, temp1, ptr
|
||||
//mov32 temp2, =SYMCRYPT_MAGIC_CONSTANT
|
||||
ldr temp2, =SYMCRYPT_MAGIC_CONSTANT
|
||||
cmp temp1, temp2
|
||||
beq check_magic_label
|
||||
//mov32 r0, 0x6d616763 // 'magc'
|
||||
ldr r0, =0x6d616763 // 'magc'
|
||||
bl SymCryptFatal
|
||||
check_magic_label:
|
||||
MACRO_END()
|
||||
#else
|
||||
MACRO_START(SYMCRYPT_CHECK_MAGIC, temp1, temp2, ptr, offset, check_magic_label)
|
||||
MACRO_END()
|
||||
#endif
|
||||
|
||||
|
||||
// TTL "Advanced Encryption Standard (AES)"
|
||||
|
||||
//
|
||||
// typedef SYMCRYPT_ALIGN_STRUCT _SYMCRYPT_AES_EXPANDED_KEY {
|
||||
// SYMCRYPT_ALIGN BYTE RoundKey[29][4][4];
|
||||
// Round keys, first the encryption round keys in encryption order,
|
||||
// followed by the decryption round keys in decryption order.
|
||||
// The first decryption round key is the last encryption round key.
|
||||
// AES-256 has 14 rounds and thus 15 round keys for encryption and 15
|
||||
// for decryption. As they share one round key, we need room for 29.
|
||||
// BYTE (*lastEncRoundKey)[4][4]; Pointer to last encryption round key
|
||||
// also the first round key for decryption
|
||||
// BYTE (*lastDecRoundKey)[4][4]; Pointer to last decryption round key.
|
||||
//
|
||||
// SYMCRYPT_MAGIC_FIELD
|
||||
// } SYMCRYPT_AES_EXPANDED_KEY, *PSYMCRYPT_AES_EXPANDED_KEY
|
||||
//
|
||||
|
||||
#define SYMCRYPT_AES_EXPANDED_KEY_RoundKey (0)
|
||||
#define SYMCRYPT_AES_EXPANDED_KEY_lastEncRoundKey (29*4*4)
|
||||
#define SYMCRYPT_AES_EXPANDED_KEY_lastDecRoundKey (29*4*4+4)
|
||||
|
||||
#if SYMCRYPT_DEBUG
|
||||
#define SYMCRYPT_AES_EXPANDED_KEY_magic (29*4*4+4+4)
|
||||
#endif
|
||||
|
||||
MACRO_START(ENC_MIX, keyptr)
|
||||
//
|
||||
// Perform the unkeyed mixing function for encryption
|
||||
// plus a key addition from the key pointer
|
||||
//
|
||||
// Input:
|
||||
// r0,r1,r2,r3 = current block
|
||||
// keyptr = pointer to current key
|
||||
// r12 = pointer to AesSboxMatrixMult
|
||||
//
|
||||
// Output:
|
||||
// r0,r1,r2,r3 = updated block
|
||||
// keyptr = updated to point to following key
|
||||
// r12 = unmodified
|
||||
//
|
||||
// Used:
|
||||
// r4,r5,r6,r7,lr are modified
|
||||
//
|
||||
// N.B. To make better use of ARM's barrel shifter, this code differs
|
||||
// from the AMD64 approach. The first lookups are not rotated;
|
||||
// instead all subsequent lookups are applied on top rotated, and
|
||||
// then a final rotation is performed to shift the bits into the
|
||||
// proper spot.
|
||||
//
|
||||
|
||||
uxtb r4, r0 // extract individual bytes from r0
|
||||
uxtb r7, r0, ror #8 //
|
||||
uxtb r6, r0, ror #16 //
|
||||
uxtb r5, r0, ror #24 //
|
||||
ldr r4, [r12, r4, lsl #2] // perform lookups of each byte, leaving
|
||||
ldr r7, [r12, r7, lsl #2] // the values unrotated for now
|
||||
ldr r6, [r12, r6, lsl #2] //
|
||||
ldr r5, [r12, r5, lsl #2] //
|
||||
|
||||
uxtb r0, r1 // extract individual bytes from r1
|
||||
uxtb lr, r1, ror #8 // (with 1 more register we could do 4 at a time)
|
||||
ldr r0, [r12, r0, lsl #2] // perform lookups
|
||||
ldr lr, [r12, lr, lsl #2] //
|
||||
eor r5, r5, r0, ror #24 // exclusive-OR with previous
|
||||
eor r4, r4, lr, ror #24 //
|
||||
uxtb r0, r1, ror #16 // extract remaining bytes from r1
|
||||
uxtb r1, r1, ror #24 //
|
||||
ldr r0, [r12, r0, lsl #2] // perform lookups
|
||||
ldr r1, [r12, r1, lsl #2] //
|
||||
eor r7, r7, r0, ror #24 // exclusive-OR with previous
|
||||
eor r6, r6, r1, ror #24 //
|
||||
|
||||
uxtb r0, r2 // extract individual bytes from r2
|
||||
uxtb r1, r2, ror #8 //
|
||||
uxtb lr, r2, ror #16 //
|
||||
uxtb r2, r2, ror #24 //
|
||||
ldr r0, [r12, r0, lsl #2] // perform lookups
|
||||
ldr r1, [r12, r1, lsl #2] //
|
||||
ldr lr, [r12, lr, lsl #2] //
|
||||
ldr r2, [r12, r2, lsl #2] //
|
||||
eor r6, r6, r0, ror #16 // exclusive-OR with previous
|
||||
eor r5, r5, r1, ror #16 //
|
||||
eor r4, r4, lr, ror #16 //
|
||||
eor r7, r7, r2, ror #16 //
|
||||
|
||||
uxtb r0, r3 // extract individual bytes from r3
|
||||
uxtb r1, r3, ror #8 //
|
||||
uxtb r2, r3, ror #16 //
|
||||
uxtb r3, r3, ror #24 //
|
||||
ldr r0, [r12, r0, lsl #2] // perform lookups
|
||||
ldr r1, [r12, r1, lsl #2] //
|
||||
ldr r2, [r12, r2, lsl #2] //
|
||||
ldr r3, [r12, r3, lsl #2] //
|
||||
eor r7, r7, r0, ror #8 // exclusive-OR with previous
|
||||
eor r6, r6, r1, ror #8 //
|
||||
eor r5, r5, r2, ror #8 //
|
||||
eor r4, r4, r3, ror #8 //
|
||||
|
||||
ldrd r0, r1, [keyptr, #0] // fetch key into r0-r3
|
||||
ldrd r2, r3, [keyptr, #8] //
|
||||
adds keyptr, keyptr, #16 // increment key pointer
|
||||
eors r0, r0, r4 // exclusive-OR the key and rotate into final
|
||||
eor r1, r1, r5, ror #8 // position
|
||||
eor r2, r2, r6, ror #16 //
|
||||
eor r3, r3, r7, ror #24 //
|
||||
MACRO_END()
|
||||
|
||||
MACRO_START(DEC_MIX, keyptr)
|
||||
//
|
||||
// Perform the unkeyed mixing function for decryption
|
||||
//
|
||||
// Input:
|
||||
// r0,r1,r2,r3 = current block
|
||||
// keyptr = pointer to current key
|
||||
// r12 = pointer to AesInvSboxMatrixMult
|
||||
//
|
||||
// Output:
|
||||
// r0,r1,r2,r3 = updated block
|
||||
// keyptr = updated to point to following key
|
||||
// r12 = unmodified
|
||||
//
|
||||
// Used:
|
||||
// r4,r5,r6,r7,lr are modified
|
||||
//
|
||||
// N.B. To make better use of ARM's barrel shifter, this code differs
|
||||
// from the AMD64 approach. The first lookups are not rotated;
|
||||
// instead all subsequent lookups are applied on top rotated, and
|
||||
// then a final rotation is performed to shift the bits into the
|
||||
// proper spot.
|
||||
//
|
||||
|
||||
uxtb r4, r0 // extract individual bytes from r0
|
||||
uxtb r5, r0, ror #8 //
|
||||
uxtb r6, r0, ror #16 //
|
||||
uxtb r7, r0, ror #24 //
|
||||
ldr r4, [r12, r4, lsl #2] // perform lookups of each byte, leaving
|
||||
ldr r5, [r12, r5, lsl #2] // the values unrotated for now
|
||||
ldr r6, [r12, r6, lsl #2] //
|
||||
ldr r7, [r12, r7, lsl #2] //
|
||||
|
||||
uxtb r0, r1 // extract individual bytes from r1
|
||||
uxtb lr, r1, ror #8 // (with 1 more register we could do 4 at a time)
|
||||
ldr r0, [r12, r0, lsl #2] // perform lookups
|
||||
ldr lr, [r12, lr, lsl #2] //
|
||||
eor r5, r5, r0, ror #8 // exclusive-OR with previous
|
||||
eor r6, r6, lr, ror #8 //
|
||||
uxtb r0, r1, ror #16 // extract remaining bytes from r1
|
||||
uxtb r1, r1, ror #24 //
|
||||
ldr r0, [r12, r0, lsl #2] // perform lookups
|
||||
ldr r1, [r12, r1, lsl #2] //
|
||||
eor r7, r7, r0, ror #8 // exclusive-OR with previous
|
||||
eor r4, r4, r1, ror #8 //
|
||||
|
||||
uxtb r0, r2 // extract individual bytes from r2
|
||||
uxtb r1, r2, ror #8 //
|
||||
uxtb lr, r2, ror #16 //
|
||||
uxtb r2, r2, ror #24 //
|
||||
ldr r0, [r12, r0, lsl #2] // perform lookups
|
||||
ldr r1, [r12, r1, lsl #2] //
|
||||
ldr lr, [r12, lr, lsl #2] //
|
||||
ldr r2, [r12, r2, lsl #2] //
|
||||
eor r6, r6, r0, ror #16 // exclusive-OR with previous
|
||||
eor r7, r7, r1, ror #16 //
|
||||
eor r4, r4, lr, ror #16 //
|
||||
eor r5, r5, r2, ror #16 //
|
||||
|
||||
uxtb r0, r3 // extract individual bytes from r3
|
||||
uxtb r1, r3, ror #8 //
|
||||
uxtb r2, r3, ror #16 //
|
||||
uxtb r3, r3, ror #24 //
|
||||
ldr r0, [r12, r0, lsl #2] // perform lookups
|
||||
ldr r1, [r12, r1, lsl #2] //
|
||||
ldr r2, [r12, r2, lsl #2] //
|
||||
ldr r3, [r12, r3, lsl #2] //
|
||||
eor r7, r7, r0, ror #24 // exclusive-OR with previous
|
||||
eor r4, r4, r1, ror #24 //
|
||||
eor r5, r5, r2, ror #24 //
|
||||
eor r6, r6, r3, ror #24 //
|
||||
|
||||
ldrd r0, r1, [keyptr, #0] // fetch key into r0-r3
|
||||
ldrd r2, r3, [keyptr, #8] //
|
||||
adds keyptr, keyptr, #16 // increment key pointer
|
||||
eors r0, r0, r4 // exclusive-OR the key and rotate into final
|
||||
eor r1, r1, r5, ror #24 // position
|
||||
eor r2, r2, r6, ror #16 //
|
||||
eor r3, r3, r7, ror #8 //
|
||||
MACRO_END()
|
||||
|
||||
MACRO_START(AES_ENCRYPT, loopLabel)
|
||||
|
||||
//
|
||||
// Input:
|
||||
// r0,r1,r2,r3 = plaintext
|
||||
// r8 = pointer to first round key to use
|
||||
// r9 = pointer to last key to use
|
||||
// r12 = pointer to AesSboxMatrixMult
|
||||
//
|
||||
// Output:
|
||||
// r4,r5,r6,r7 = ciphertext
|
||||
// r8 = modified to point to last key
|
||||
// r9 = unmodified
|
||||
// r12 = unmodified
|
||||
//
|
||||
// Used:
|
||||
// lr is also modified
|
||||
//
|
||||
|
||||
//
|
||||
// xor in first round key
|
||||
//
|
||||
|
||||
ldrd r4, r5, [r8, #0] // fetch key in r4-r7
|
||||
ldrd r6, r7, [r8, #8] //
|
||||
eors r0, r0, r4 // exclusive-OR with the plaintext
|
||||
eors r1, r1, r5 //
|
||||
eors r2, r2, r6 //
|
||||
eors r3, r3, r7 //
|
||||
|
||||
add r8, r8, #16 // point to second key
|
||||
|
||||
loopLabel:
|
||||
//
|
||||
// Block is r0,r1,r2,r3
|
||||
// r8 points to current round key
|
||||
//
|
||||
|
||||
ENC_MIX r8 // encrypt the block and increment key
|
||||
cmp r8, r9 // are we at the end?
|
||||
blo loopLabel // loop until it is so
|
||||
|
||||
//
|
||||
// Now for the final round
|
||||
// We use the fact that SboxMatrixMult[0] table is also
|
||||
// an Sbox table if you use the second element of each entry.
|
||||
//
|
||||
// Result is in r4,r5,r6,r7
|
||||
//
|
||||
|
||||
add r12, r12, #1 // advance by 1 to point to second element
|
||||
uxtb r4, r0 // extract individual bytes from r0
|
||||
uxtb r7, r0, ror #8 //
|
||||
uxtb r6, r0, ror #16 //
|
||||
uxtb r5, r0, ror #24 //
|
||||
ldrb r4, [r12, r4, lsl #2] // perform lookups of each byte, leaving
|
||||
ldrb r7, [r12, r7, lsl #2] // the values unrotated for now
|
||||
ldrb r6, [r12, r6, lsl #2] //
|
||||
ldrb r5, [r12, r5, lsl #2] //
|
||||
|
||||
uxtb r0, r1 // extract individual bytes from r1
|
||||
uxtb lr, r1, ror #8 // (with 1 more register we could do 4 at a time)
|
||||
ldrb r0, [r12, r0, lsl #2] // perform lookups
|
||||
ldrb lr, [r12, lr, lsl #2] //
|
||||
orr r5, r5, r0, lsl #8 // merge with previous
|
||||
orr r4, r4, lr, lsl #8 //
|
||||
uxtb r0, r1, ror #16 // extract remaining bytes from r1
|
||||
uxtb r1, r1, ror #24 //
|
||||
ldrb r0, [r12, r0, lsl #2] // perform lookups
|
||||
ldrb r1, [r12, r1, lsl #2] //
|
||||
orr r7, r7, r0, lsl #8 // merge with previous
|
||||
orr r6, r6, r1, lsl #8 //
|
||||
|
||||
uxtb r0, r2 // extract individual bytes from r2
|
||||
uxtb r1, r2, ror #8 //
|
||||
uxtb lr, r2, ror #16 //
|
||||
uxtb r2, r2, ror #24 //
|
||||
ldrb r0, [r12, r0, lsl #2] // perform lookups
|
||||
ldrb r1, [r12, r1, lsl #2] //
|
||||
ldrb lr, [r12, lr, lsl #2] //
|
||||
ldrb r2, [r12, r2, lsl #2] //
|
||||
orr r6, r6, r0, lsl #16 // merge with previous
|
||||
orr r5, r5, r1, lsl #16 //
|
||||
orr r4, r4, lr, lsl #16 //
|
||||
orr r7, r7, r2, lsl #16 //
|
||||
|
||||
uxtb r0, r3 // extract individual bytes from r3
|
||||
uxtb r1, r3, ror #8 //
|
||||
uxtb r2, r3, ror #16 //
|
||||
uxtb r3, r3, ror #24 //
|
||||
ldrb r0, [r12, r0, lsl #2] // perform lookups
|
||||
ldrb r1, [r12, r1, lsl #2] //
|
||||
ldrb r2, [r12, r2, lsl #2] //
|
||||
ldrb r3, [r12, r3, lsl #2] //
|
||||
orr r7, r7, r0, lsl #24 // merge with previous
|
||||
orr r6, r6, r1, lsl #24 //
|
||||
orr r5, r5, r2, lsl #24 //
|
||||
orr r4, r4, r3, lsl #24 //
|
||||
sub r12, r12, #1 // put r12 back to its original value
|
||||
|
||||
//
|
||||
// xor in final round key
|
||||
//
|
||||
|
||||
ldrd r0, r1, [r9, #0] // fetch key into r0-r3
|
||||
ldrd r2, r3, [r9, #8] //
|
||||
eors r4, r4, r0 // exclusive-OR the key and rotate into final
|
||||
eor r5, r1, r5, ror #8 // position
|
||||
eor r6, r2, r6, ror #16 //
|
||||
eor r7, r3, r7, ror #24 //
|
||||
MACRO_END()
|
||||
|
||||
MACRO_START(AES_DECRYPT, loopLabel)
|
||||
|
||||
//
|
||||
// Input:
|
||||
// r0,r1,r2,r3 = ciphertext
|
||||
// r8 = pointer to first round key to use
|
||||
// r9 = pointer to last key to use
|
||||
// r10 = pointer to InvSbox
|
||||
// r12 = pointer to InvSboxMatrixMult
|
||||
//
|
||||
// Output:
|
||||
// r4,r5,r6,r7 = plaintext
|
||||
// r8 = modified to point to last key
|
||||
// r9 = unmodified
|
||||
// r10 = unmodified
|
||||
// r12 = unmodified
|
||||
//
|
||||
// Used:
|
||||
// lr is also modified
|
||||
//
|
||||
|
||||
//
|
||||
// xor in first round key
|
||||
//
|
||||
ldrd r4, r5, [r8, #0] // fetch key in r4-r7
|
||||
ldrd r6, r7, [r8, #8] //
|
||||
eors r0, r0, r4 // exclusive-OR with the plaintext
|
||||
eors r1, r1, r5 //
|
||||
eors r2, r2, r6 //
|
||||
eors r3, r3, r7 //
|
||||
|
||||
add r8, r8, #16 // point to second key
|
||||
|
||||
loopLabel:
|
||||
//
|
||||
// Block is r0, r1, r2, r3
|
||||
// r8 points to current round key
|
||||
//
|
||||
|
||||
DEC_MIX r8 // decrypt the block and increment key
|
||||
cmp r8, r9 // are we at the end?
|
||||
blo loopLabel // loop until it is so
|
||||
|
||||
//
|
||||
// Now for the final round
|
||||
// Result is in r4, r5, r6, r7
|
||||
//
|
||||
|
||||
uxtb r4, r0 // extract individual bytes from r0
|
||||
uxtb r5, r0, ror #8 //
|
||||
uxtb r6, r0, ror #16 //
|
||||
uxtb r7, r0, ror #24 //
|
||||
ldrb r4, [r10, r4] // perform lookups of each byte, leaving
|
||||
ldrb r5, [r10, r5] // the values unrotated for now
|
||||
ldrb r6, [r10, r6] //
|
||||
ldrb r7, [r10, r7] //
|
||||
|
||||
uxtb r0, r1 // extract individual bytes from r1
|
||||
uxtb lr, r1, ror #8 // (with 1 more register we could do 4 at a time)
|
||||
ldrb r0, [r10, r0] // perform lookups
|
||||
ldrb lr, [r10, lr] //
|
||||
orr r5, r5, r0, lsl #24 // merge with previous
|
||||
orr r6, r6, lr, lsl #24 //
|
||||
uxtb r0, r1, ror #16 // extract remaining bytes from r1
|
||||
uxtb r1, r1, ror #24 //
|
||||
ldrb r0, [r10, r0] // perform lookups
|
||||
ldrb r1, [r10, r1] //
|
||||
orr r7, r7, r0, lsl #24 // merge with previous
|
||||
orr r4, r4, r1, lsl #24 //
|
||||
|
||||
uxtb r0, r2 // extract individual bytes from r2
|
||||
uxtb r1, r2, ror #8 //
|
||||
uxtb lr, r2, ror #16 //
|
||||
uxtb r2, r2, ror #24 //
|
||||
ldrb r0, [r10, r0] // perform lookups
|
||||
ldrb r1, [r10, r1] //
|
||||
ldrb lr, [r10, lr] //
|
||||
ldrb r2, [r10, r2] //
|
||||
orrs r6, r6, r0, lsl #16 // merge with previous
|
||||
orr r7, r7, r1, lsl #16 //
|
||||
orr r4, r4, lr, lsl #16 //
|
||||
orr r5, r5, r2, lsl #16 //
|
||||
|
||||
uxtb r0, r3 // extract individual bytes from r3
|
||||
uxtb r1, r3, ror #8 //
|
||||
uxtb r2, r3, ror #16 //
|
||||
uxtb r3, r3, ror #24 //
|
||||
ldrb r0, [r10, r0] // perform lookups
|
||||
ldrb r1, [r10, r1] //
|
||||
ldrb r2, [r10, r2] //
|
||||
ldrb r3, [r10, r3] //
|
||||
orr r7, r7, r0, lsl #8 // merge with previous
|
||||
orr r4, r4, r1, lsl #8 //
|
||||
orr r5, r5, r2, lsl #8 //
|
||||
orr r6, r6, r3, lsl #8 //
|
||||
|
||||
//
|
||||
// xor in final round key
|
||||
//
|
||||
|
||||
ldrd r0, r1, [r9, #0] // fetch key into r0-r3
|
||||
ldrd r2, r3, [r9, #8] //
|
||||
eors r4, r4, r0 // exclusive-OR the key and rotate into final
|
||||
eor r5, r1, r5, ror #24 // position
|
||||
eor r6, r2, r6, ror #16 //
|
||||
eor r7, r3, r7, ror #8 //
|
||||
|
||||
MACRO_END()
|
||||
|
||||
|
||||
//
|
||||
// VOID
|
||||
// SYMCRYPT_CALL
|
||||
// SymCryptAesEncrypt( _In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
|
||||
// _In_reads_bytes_( SYMCRYPT_AES_BLOCK_LEN ) PCBYTE pbPlaintext,
|
||||
// _Out_writes_bytes_( SYMCRYPT_AES_BLOCK_LEN ) PBYTE pbCiphertext )
|
||||
//
|
||||
// NESTED_ENTRY SymCryptAesEncryptAsm
|
||||
FUNCTION_START(SymCryptAesEncryptAsm, 3, 12)
|
||||
|
||||
//
|
||||
// Input parameters:
|
||||
// r0 = pExpandedKey
|
||||
// r1 = pbPlaintext
|
||||
// r2 = pbCiphertext
|
||||
//
|
||||
|
||||
// push {r2, r4-r11, lr}
|
||||
push {r2}
|
||||
|
||||
//
|
||||
// Stack layout:
|
||||
// [sp] = r2 = pbCipherText
|
||||
//
|
||||
|
||||
SYMCRYPT_CHECK_MAGIC r4, r5, r0, SYMCRYPT_AES_EXPANDED_KEY_magic, SymCryptAesEncryptAsm_check_magic_label
|
||||
|
||||
ldr r9, [r0, #SYMCRYPT_AES_EXPANDED_KEY_lastEncRoundKey] // r9 = last key
|
||||
mov r8, r0 // r8 = first key
|
||||
// mov32 r12, SymCryptAesSboxMatrixMult // r12 = matrix mult table
|
||||
ldr r12, =SymCryptAesSboxMatrixMult // r12 = matrix mult table
|
||||
|
||||
ldr r0, [r1, #0] // load the plaintext
|
||||
ldr r2, [r1, #8] //
|
||||
ldr r3, [r1, #12] //
|
||||
ldr r1, [r1, #4] //
|
||||
|
||||
AES_ENCRYPT SymCryptAesEncryptAsm_loopLabel
|
||||
//
|
||||
// Plaintext in r0, r1, r2, r3
|
||||
// r8 points to first round key to use
|
||||
// r9 is last key to use (unchanged)
|
||||
// r12 points to SboxMatrixMult (unchanged)
|
||||
// Ciphertext ends up in r4, r5, r6, r7
|
||||
//
|
||||
|
||||
ldr r0, [sp] // recover pbCipherText
|
||||
str r4, [r0, #0] // store the encrypted data
|
||||
str r5, [r0, #4] //
|
||||
str r6, [r0, #8] //
|
||||
str r7, [r0, #12] //
|
||||
|
||||
// pop {r2, r4-r11, pc} // return
|
||||
pop {r2}
|
||||
|
||||
// NESTED_END SymCryptAesEncryptAsm
|
||||
FUNCTION_END(SymCryptAesEncryptAsm)
|
||||
|
||||
//
|
||||
// VOID
|
||||
// SYMCRYPT_CALL
|
||||
// SymCryptAesDecrypt( _In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
|
||||
// _In_reads_bytes_( SYMCRYPT_AES_BLOCK_LEN ) PCBYTE pbCiphertext,
|
||||
// _Out_writes_bytes_( SYMCRYPT_AES_BLOCK_LEN ) PBYTE pbPlaintext )
|
||||
//
|
||||
// NESTED_ENTRY SymCryptAesDecryptAsm
|
||||
FUNCTION_START(SymCryptAesDecryptAsm, 3, 12)
|
||||
|
||||
//
|
||||
// Input parameters:
|
||||
// r0 = pExpandedKey
|
||||
// r1 = pbCiphertext
|
||||
// r2 = pbPlaintext
|
||||
//
|
||||
|
||||
// push {r2, r4-r11, lr}
|
||||
push {r2}
|
||||
|
||||
//
|
||||
// Stack layout:
|
||||
// [sp] = r2 = pbPlaintext
|
||||
//
|
||||
|
||||
SYMCRYPT_CHECK_MAGIC r4, r5, r0, SYMCRYPT_AES_EXPANDED_KEY_magic, SymCryptAesDecryptAsm_check_magic_label
|
||||
|
||||
ldr r8, [r0, #SYMCRYPT_AES_EXPANDED_KEY_lastEncRoundKey] // r8 = first key
|
||||
ldr r9, [r0, #SYMCRYPT_AES_EXPANDED_KEY_lastDecRoundKey] // r9 = last key
|
||||
// mov32 r10, SymCryptAesInvSbox // r10 = inverse sbox table
|
||||
// mov32 r12, SymCryptAesInvSboxMatrixMult // r11 = inverse matrix mult table
|
||||
ldr r10, =SymCryptAesInvSbox // r10 = inverse sbox table
|
||||
ldr r12, =SymCryptAesInvSboxMatrixMult // r11 = inverse matrix mult table
|
||||
|
||||
ldr r0, [r1, #0] // load the ciphertext
|
||||
ldr r2, [r1, #8] //
|
||||
ldr r3, [r1, #12] //
|
||||
ldr r1, [r1, #4] //
|
||||
|
||||
AES_DECRYPT SymCryptAesDecryptAsm_loopLabel
|
||||
//
|
||||
// Ciphertext in r0, r1, r2, r3
|
||||
// r8 points to first round key to use
|
||||
// r9 is last key to use (unchanged)
|
||||
// r10 points to InvSbox (unchanged)
|
||||
// r12 points to InvSboxMatrixMult (unchanged)
|
||||
// Ciphertext ends up in r4, r5, r6, r7
|
||||
//
|
||||
|
||||
ldr r0, [sp] // recover pbPlaintext
|
||||
str r4, [r0, #0] // store the decrypted data
|
||||
str r5, [r0, #4] //
|
||||
str r6, [r0, #8] //
|
||||
str r7, [r0, #12] //
|
||||
|
||||
pop {r2}
|
||||
// pop {r2, r4-r11, pc} // return
|
||||
|
||||
// NESTED_END SymCryptAesDecryptAsm
|
||||
FUNCTION_END(SymCryptAesDecryptAsm)
|
||||
|
||||
|
||||
//
// VOID
// SYMCRYPT_CALL
// SymCryptAesCbcEncrypt(
//     _In_                                        PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
//     _In_reads_bytes_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE                       pbChainingValue,
//     _In_reads_bytes_( cbData )                  PCBYTE                      pbSrc,
//     _Out_writes_bytes_( cbData )                PBYTE                       pbDst,
//                                                 SIZE_T                      cbData )

// NESTED_ENTRY SymCryptAesCbcEncryptAsm
FUNCTION_START(SymCryptAesCbcEncryptAsm, 4, 12)

    //
    // Input parameters:
    //  r0 = pExpandedKey
    //  r1 = pbChainingValue
    //  r2 = pbSrc
    //  r3 = pbDst
    //  [sp] = cbData
    //

    // push {r0-r2, r4-r11, lr}         // (r4-r11/lr are saved by the FUNCTION_START prologue)
    push {r0-r2}
    sub sp, sp, #16

    //
    // Stack layout:
    // [sp]    = pbSrc (current)
    // [sp+4]  = pbSrcEnd
    // [sp+16] = r0 = pbExpandedKey
    // [sp+20] = r1 = pbChainingValue
    // [sp+24] = r2 = pbSrc
    // [sp+64] = cbData
    //

    SYMCRYPT_CHECK_MAGIC r4, r5, r0, SYMCRYPT_AES_EXPANDED_KEY_magic, SymCryptAesCbcEncryptAsm_check_magic_label

    pld     [r2]                // prefetch source data
    ldr     r4, [sp, #64]       // r4 = cbData
    mov     r8, r2              // r8 = pbSrc on loop entry
    ldr     r9, [r0, #SYMCRYPT_AES_EXPANDED_KEY_lastEncRoundKey]    // r9 = last enc round key (invariant)
    mov     r10, r3             // r10 = pbDst
    // mov32 r12, SymCryptAesSboxMatrixMult
    ldr     r12, =SymCryptAesSboxMatrixMult     // r12 = pointer to lookup table (invariant)
    bics    r4, r4, #15         // r4 &= ~15: round cbData down to whole blocks
    beq     SymCryptAesCbcEncryptNoData         // skip if no data
    adds    r4, r4, r2          // r4 = pbSrc + cbData = pbSrcEnd
    // strd r2, r4, [sp]        // save pbSrc/pbSrcEnd at [sp]
    str     r2, [sp]
    str     r4, [sp, #4]

    pld     [r8, #32]           // prefetch source data
    ldr     r4, [r1, #0]        // load chaining state from pbChainingValue
    ldr     r5, [r1, #4]        //
    ldr     r6, [r1, #8]        //
    ldr     r7, [r1, #12]       //

SymCryptAesCbcEncryptAsmLoop:
    //
    // Loop register setup
    //  r4,r5,r6,r7 = chaining state
    //  r8  = pbSrc
    //  r9  = last round key to use
    //  r10 = pbDst
    //  r12 = SboxMatrixMult
    //

    ldr     r0, [r8]            // read next 16 bytes of plaintext
    ldr     r1, [r8, #4]        //
    ldr     r2, [r8, #8]        //
    ldr     r3, [r8, #12]       //
    pld     [r8, #64]           // prefetch source data
    add     r8, r8, #16         // pbSrc += 16
    str     r8, [sp]            // save it (r8 is clobbered below)

    eors    r0, r0, r4          // exclusive-OR against chaining value
    eors    r1, r1, r5          //
    eors    r2, r2, r6          //
    eors    r3, r3, r7          //

    ldr     r8, [sp, #16]       // r8 = first round key
    AES_ENCRYPT SymCryptAesCbcEncryptAsm_loopLabel
    //
    // Plaintext in r0, r1, r2, r3
    // r8 points to first round key to use
    // r9 is last key to use (unchanged)
    // r12 points to SboxMatrixMult (unchanged)
    // Ciphertext ends up in r4, r5, r6, r7
    //

    // ldrd r8, r0, [sp]        // fetch pbSrc/pbSrcEnd
    ldr     r8, [sp]
    ldr     r0, [sp, #4]

    str     r4, [r10, #0]       // write ciphertext
    str     r5, [r10, #4]       //
    str     r6, [r10, #8]       //
    str     r7, [r10, #12]      //
    add     r10, r10, #16       // pbDst += 16

    cmp     r8, r0              // are we at the end of source?
    blo     SymCryptAesCbcEncryptAsmLoop        // loop until we are

    ldr     r0, [sp, #20]       // r0 = pbChainingValue
    str     r4, [r0, #0]        // update the chaining value (last ciphertext block)
    str     r5, [r0, #4]        //
    str     r6, [r0, #8]        //
    str     r7, [r0, #12]       //

SymCryptAesCbcEncryptNoData:

    add     sp, sp, #16
    pop     {r0-r2}
    // pop {r0-r2, r4-r11, pc}  // return (epilogue emitted by FUNCTION_END)

// NESTED_END SymCryptAesCbcEncryptAsm
FUNCTION_END(SymCryptAesCbcEncryptAsm)
|
||||
|
||||
|
||||
//
// VOID
// SYMCRYPT_CALL
// SymCryptAesCbcDecrypt(
//     _In_                                        PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
//     _In_reads_bytes_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE                       pbChainingValue,
//     _In_reads_bytes_( cbData )                  PCBYTE                      pbSrc,
//     _Out_writes_bytes_( cbData )                PBYTE                       pbDst,
//                                                 SIZE_T                      cbData )
//
// Processes the buffer from the LAST block to the first so that the previous
// ciphertext block (the CBC "IV" of each block) is still available in memory.

// NESTED_ENTRY SymCryptAesCbcDecryptAsm
FUNCTION_START(SymCryptAesCbcDecryptAsm, 4, 12)

    //
    // Input parameters:
    //  r0 = pExpandedKey
    //  r1 = pbChainingValue
    //  r2 = pbSrc
    //  r3 = pbDst
    //  [sp] = cbData
    //

    // push {r0-r2, r4-r11, lr}         // (r4-r11/lr are saved by the FUNCTION_START prologue)
    push {r0-r2}
    sub sp, sp, #32

    //
    // Stack layout:
    // [sp]    = pbSrc (current block, moving backwards)
    // [sp+4]  = pbDst (current block, moving backwards)
    // [sp+8]  = first round key pointer (lastEncRoundKey) -- NOTE: earlier comment said pbSrcEnd; the code stores r8 = lastEncRoundKey here
    // [sp+16] = saved last ciphertext block (becomes the new chaining value)
    // [sp+32] = r0 = pbExpandedKey
    // [sp+36] = r1 = pbChainingValue
    // [sp+40] = r2 = pbSrc
    // [sp+80] = cbData
    //

    SYMCRYPT_CHECK_MAGIC r4, r5, r0, SYMCRYPT_AES_EXPANDED_KEY_magic, SymCryptAesCbcDecryptAsm_check_magic_label

    ldr     r4, [sp, #80]       // r4 = cbData
    bics    r4, r4, #15         // r4 &= ~15: whole blocks only
    beq     SymCryptAesCbcDecryptNoData     // skip if no data

    ldr     r9, [r0, #SYMCRYPT_AES_EXPANDED_KEY_lastDecRoundKey]    // r9 = last dec round key (invariant)
    ldr     r8, [r0, #SYMCRYPT_AES_EXPANDED_KEY_lastEncRoundKey]    // r8 = first round key for decryption

    subs    r4, r4, #16         // r4 = offset of the last block
    adds    r3, r3, r4          // r3 = pbDst + cbData - 16 (last dst block)
    adds    r2, r2, r4          // r2 = pbSrc + cbData - 16 (last src block)
    pld     [r2]                // prefetch source data
    str     r3, [sp, #4]
    str     r2, [sp, #0]
    str     r8, [sp, #8]

    // mov32 r10, SymCryptAesInvSbox
    // mov32 r12, SymCryptAesInvSboxMatrixMult
    ldr     r10, =SymCryptAesInvSbox
    ldr     r12, =SymCryptAesInvSboxMatrixMult

    //
    // Load last ciphertext block & save on stack (we need to put it in the pbChaining buffer later)
    //
    pld     [r2, #-32]          // prefetch source data
    ldr     r0, [r2, #0]
    ldr     r1, [r2, #4]
    ldr     r3, [r2, #12]
    ldr     r2, [r2, #8]        // r2 (source pointer) is overwritten last

    strd    r0, r1, [sp, #16]
    strd    r2, r3, [sp, #24]

    b       SymCryptAesCbcDecryptAsmLoopEntry

SymCryptAesCbcDecryptAsmLoop:
    // Loop register setup:
    //  r4-r7 = raw AES decryption of the current ciphertext block
    //  r8    = pbSrc (points just past the current block)
    //  lr    = pbDst (current block)
    // (stale x86-era register comments from the ported source removed)

    ldr     r0, [r8, #-16]      // previous ciphertext block = chaining input for this block
    ldr     r1, [r8, #-12]
    ldr     r2, [r8, #-8]
    ldr     r3, [r8, #-4]
    pld     [r8, #-64]          // prefetch source data

    eors    r4, r4, r0          // plaintext = AES-decrypt(ciphertext) XOR previous ciphertext
    eors    r5, r5, r1
    eors    r6, r6, r2
    eors    r7, r7, r3

    str     r4, [lr, #0]        // store plaintext block
    str     r5, [lr, #4]
    str     r6, [lr, #8]
    str     r7, [lr, #12]

    sub     lr, lr, #16         // pbDst -= 16
    sub     r8, r8, #16         // pbSrc -= 16
    str     r8, [sp]
    str     lr, [sp, #4]

SymCryptAesCbcDecryptAsmLoopEntry:

    ldr     r8, [sp, #8]        // r8 = first round key

    AES_DECRYPT SymCryptAesCbcDecryptAsm_loopLabel

    // ldrd r8, lr, [sp, #0]
    ldr     r8, [sp, #0]        // r8 = pbSrc (current)
    ldr     lr, [sp, #4]        // lr = pbDst (current)
    ldr     r0, [sp, #40]       // r0 = original pbSrc (start of buffer)
    cmp     r8, r0
    bhi     SymCryptAesCbcDecryptAsmLoop    // loop while blocks remain before the current one

    ldr     r8, [sp, #36]       // r8 = pbChainingValue
    ldr     r0, [r8, #0]        // load the caller's chaining value (IV for the first block)
    ldr     r1, [r8, #4]
    ldr     r2, [r8, #8]
    ldr     r3, [r8, #12]

    eors    r4, r4, r0          // first plaintext block = decrypt(first ciphertext) XOR IV
    eors    r5, r5, r1
    eors    r6, r6, r2
    eors    r7, r7, r3

    str     r4, [lr, #0]
    str     r5, [lr, #4]
    str     r6, [lr, #8]
    str     r7, [lr, #12]

    //
    // Update the chaining value to the last ciphertext block
    //
    ldrd    r0, r1, [sp, #16]
    ldrd    r2, r3, [sp, #24]
    str     r0, [r8, #0]
    str     r1, [r8, #4]
    str     r2, [r8, #8]
    str     r3, [r8, #12]

SymCryptAesCbcDecryptNoData:

    add     sp, sp, #32
    pop     {r0-r2}
    // pop {r0-r2, r4-r11, pc}  // return (epilogue emitted by FUNCTION_END)

// NESTED_END SymCryptAesCbcDecryptAsm
FUNCTION_END(SymCryptAesCbcDecryptAsm)
|
||||
|
||||
//
// VOID
// SYMCRYPT_CALL
// SymCryptAesCtrMsb64(
//     _In_                                        PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
//     _In_reads_bytes_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE                       pbChainingValue,
//     _In_reads_bytes_( cbData )                  PCBYTE                      pbSrc,
//     _Out_writes_bytes_( cbData )                PBYTE                       pbDst,
//                                                 SIZE_T                      cbData )
//
// CTR mode with a 64-bit big-endian counter in the second half of the chaining value.

// NESTED_ENTRY SymCryptAesCtrMsb64Asm
// FIX: removed the stray trailing ':' after the FUNCTION_START() invocation.
// Every other FUNCTION_START in this file has no colon; the extra token is
// emitted after the macro expansion and breaks assembly.
FUNCTION_START(SymCryptAesCtrMsb64Asm, 4, 12)

    //
    // Input parameters:
    //  r0 = pExpandedKey
    //  r1 = pbChainingValue
    //  r2 = pbSrc
    //  r3 = pbDst
    //  [sp] = cbData
    //

    // push {r0-r2, r4-r11, lr}         // (r4-r11/lr are saved by the FUNCTION_START prologue)
    push {r0-r2}
    sub sp, sp, #32

    //
    // Stack layout:
    // [sp]    = pbDst
    // [sp+4]  = pbSrcEnd
    // [sp+16] = local copy of chaining data (counter block)
    // [sp+32] = r0 = pbExpandedKey
    // [sp+36] = r1 = pbChainingValue
    // [sp+40] = r2 = pbSrc
    // [sp+80] = cbData
    //

    SYMCRYPT_CHECK_MAGIC r4, r5, r0, SYMCRYPT_AES_EXPANDED_KEY_magic, SymCryptAesCtrMsb64Asm_check_magic_label

    pld     [r2]                // prefetch source data
    ldr     r4, [sp, #80]       // r4 = cbData
    ldr     r9, [r0, #SYMCRYPT_AES_EXPANDED_KEY_lastEncRoundKey]    // r9 = last enc round key (invariant)
    mov     r10, r2             // r10 = pbSrc
    // mov32 r12, SymCryptAesSboxMatrixMult
    ldr     r12, =SymCryptAesSboxMatrixMult     // r12 = pointer to lookup table (invariant)
    bics    r4, r4, #15         // r4 &= ~15: whole blocks only
    beq     SymCryptAesCtrMsb64NoData           // skip if no data
    adds    r4, r4, r2          // r4 = pbSrc + cbData
    // strd r3, r4, [sp]        // save pbDst/pbSrcEnd at [sp]
    str     r3, [sp]            // save pbDst at [sp]
    str     r4, [sp, #4]        // save pbSrcEnd at [sp, #4]

    pld     [r10, #32]          // prefetch source data
    mov     r3, r1              // load chaining state (counter block) from pbChainingValue
    ldr     r0, [r3, #0]        //
    ldr     r1, [r3, #4]        //
    ldr     r2, [r3, #8]        //
    ldr     r3, [r3, #12]       //

    strd    r0, r1, [sp, #16]   // save a local copy
    strd    r2, r3, [sp, #24]   //

SymCryptAesCtrMsb64AsmLoop:
    //
    // Loop register setup
    //  r0,r1,r2,r3 = counter block (chaining state)
    //  r9  = last round key to use
    //  r10 = pbSrc
    //  r12 = SboxMatrixMult
    // (r8 is loaded with the first round key below, and later with pbDst)
    //

    ldr     r8, [sp, #32]       // r8 = first round key
    // FIX: use the function-qualified loop label (was plain 'loopLabel',
    // which would collide with any other AES_ENCRYPT expansion using the
    // same unqualified name in this translation unit).
    AES_ENCRYPT SymCryptAesCtrMsb64Asm_loopLabel
    //
    // Counter block was in r0, r1, r2, r3
    // r8 points to first round key to use
    // r9 is last key to use (unchanged)
    // r12 points to SboxMatrixMult (unchanged)
    // Keystream block ends up in r4, r5, r6, r7
    //

    ldr     r0, [r10, #0]       // load plaintext
    ldr     r1, [r10, #4]       //
    ldr     r2, [r10, #8]       //
    ldr     r3, [r10, #12]      //
    pld     [r10, #64]          // prefetch source data

    // ldrd r8, lr, [sp]        // fetch pbDst/pbSrcEnd
    ldr     r8, [sp]
    ldr     lr, [sp, #4]

    eors    r0, r0, r4          // exclusive-OR against encrypt results (keystream)
    eors    r1, r1, r5          //
    eors    r2, r2, r6          //
    eors    r3, r3, r7          //

    str     r0, [r8, #0]        // store to destination
    str     r1, [r8, #4]        //
    str     r2, [r8, #8]        //
    str     r3, [r8, #12]       //

    ldrd    r0, r1, [sp, #16]   // reload counter block (chaining state)
    ldrd    r2, r3, [sp, #24]   //

    add     r8, r8, #16         // pbDst += 16
    add     r10, r10, #16       // pbSrc += 16
    str     r8, [sp]            // save pbDst

    rev     r3, r3              // byte-reverse the second qword (big-endian counter)
    rev     r2, r2              //
    adds    r3, r3, #1          // increment the 64-bit counter
    adcs    r2, r2, #0          //
    rev     r3, r3              // re-reverse back to big-endian
    rev     r2, r2              //
    strd    r2, r3, [sp, #24]   // write updated counter

    cmp     r10, lr             // done?
    blo     SymCryptAesCtrMsb64AsmLoop          // loop until finished

    ldr     r0, [sp, #36]       // get pbChainingValue
    movs    r1, #0              // get 0 in r1
    str     r2, [r0, #8]        // write back only the modified (counter) half of the chaining state
    str     r3, [r0, #12]       //

    // strd r1, r1, [sp, #16]   // wipe the stack copy of the counter block
    // strd r1, r1, [sp, #24]   //
    str     r1, [sp, #16]
    str     r1, [sp, #20]
    str     r1, [sp, #24]
    str     r1, [sp, #28]

SymCryptAesCtrMsb64NoData:

    add     sp, sp, #32
    pop     {r0-r2}
    // pop {r0-r2, r4-r11, pc}  // return (epilogue emitted by FUNCTION_END)

// NESTED_END SymCryptAesCtrMsb64Asm
FUNCTION_END(SymCryptAesCtrMsb64Asm)
|
|
@ -0,0 +1,790 @@
|
|||
//
|
||||
// fdef_asm.cppasm Assembler code for large integer arithmetic in the default data format for the arm architecture
|
||||
//
|
||||
// Copyright (c) Microsoft Corporation. Licensed under the MIT license.
|
||||
//
|
||||
|
||||
// #include "ksarm.h"
|
||||
|
||||
#include "C_asm_shared.inc"
|
||||
|
||||
// A digit consists of 4 words of 32 bits each
|
||||
|
||||
//UINT32
//SYMCRYPT_CALL
// SymCryptFdefRawAdd(
//     _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE )   PCUINT32    pSrc1,
//     _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE )   PCUINT32    pSrc2,
//     _Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32     pDst,
//                                                             UINT32      nDigits )
//
// Multi-precision add; returns the final carry (0 or 1) in r0.
// A digit is 4 words of 32 bits; each loop iteration processes one digit.
//
// Initial inputs to registers:
//  pSrc1   -> r0
//  pSrc2   -> r1
//  pDst    -> r2
//  nDigits -> r3

// LEAF_ENTRY SymCryptFdefRawAddAsm
FUNCTION_START(SymCryptFdefRawAddAsm, 4, 10)
    // push {r4-r9, lr}                 // (saved by the FUNCTION_START prologue)

    neg     r3, r3              // negate the digit count (count up towards zero)
    mov     r8, #0              // carry = r8 = 0
    mov     r9, #0              // r9 = 0 (constant zero used to materialize the carry)

SymCryptFdefRawAddAsmLoop:
    rrxs    r8, r8              // set the carry flag if bit[0] of r8 is set

    ldmia   r0!, {r4, r6}       // Load two words of pSrc1
    ldmia   r1!, {r5, r7}       // Load two words of pSrc2
    adcs    r4, r4, r5
    adcs    r6, r6, r7
    stmia   r2!, {r4, r6}       // Store the result in the destination

    ldmia   r0!, {r4, r6}       // Load two words of pSrc1
    ldmia   r1!, {r5, r7}       // Load two words of pSrc2
    adcs    r4, r4, r5
    adcs    r6, r6, r7
    stmia   r2!, {r4, r6}       // Store the result in the destination

    adc     r8, r9, r9          // r8 = 1 if the carry flag is set

    adds    r3, r3, #1          // Increment the (negated) digit count by one
    bne     SymCryptFdefRawAddAsmLoop

    mov     r0, r8              // Set the return value equal to the carry

    // pop {r4-r9, pc}          // (epilogue emitted by FUNCTION_END)

// LEAF_END SymCryptFdefRawAddAsm
FUNCTION_END(SymCryptFdefRawAddAsm)
|
||||
|
||||
//UINT32
//SYMCRYPT_CALL
//SymCryptFdefRawSub(
//     _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE )   PCUINT32    Src1,
//     _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE )   PCUINT32    Src2,
//     _Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32     Dst,
//                                                             UINT32      nDigits )
//
// Multi-precision subtract (Src1 - Src2); returns the final borrow (0 or 1) in r0.
//
// Initial inputs to registers:
//  pSrc1   -> r0
//  pSrc2   -> r1
//  pDst    -> r2
//  nDigits -> r3

// LEAF_ENTRY SymCryptFdefRawSubAsm
FUNCTION_START(SymCryptFdefRawSubAsm, 4, 10)
    // push {r4-r9, lr}                 // (saved by the FUNCTION_START prologue)

    neg     r3, r3              // negate the digit count (count up towards zero)
    mov     r8, #0              // borrow = r8 = 0
    mov     r9, #0              // r9 = 0 (constant zero)

SymCryptFdefRawSubAsmLoop:
    subs    r8, r9, r8          // if r8>0 then the "borrow flag" (carry clear) is set

    ldmia   r0!, {r4, r6}       // Load two words of pSrc1
    ldmia   r1!, {r5, r7}       // Load two words of pSrc2
    sbcs    r4, r4, r5
    sbcs    r6, r6, r7
    stmia   r2!, {r4, r6}       // Store the result in the destination

    ldmia   r0!, {r4, r6}       // Load two words of pSrc1
    ldmia   r1!, {r5, r7}       // Load two words of pSrc2
    sbcs    r4, r4, r5
    sbcs    r6, r6, r7
    stmia   r2!, {r4, r6}       // Store the result in the destination

    sbc     r8, r9, r9          // If borrow=1, then r8 = -1 = 0xffffffff

    adds    r3, r3, #1          // Increment the (negated) digit count by one
    bne     SymCryptFdefRawSubAsmLoop

    and     r0, r8, #1          // If r8>0, set the return value to 1

    // pop {r4-r9, pc}          // (epilogue emitted by FUNCTION_END)

// LEAF_END SymCryptFdefRawSubAsm
FUNCTION_END(SymCryptFdefRawSubAsm)
|
||||
|
||||
//VOID
//SYMCRYPT_CALL
//SymCryptFdefMaskedCopy(
//     _In_reads_bytes_( nDigits * SYMCRYPT_FDEF_DIGIT_SIZE )      PCBYTE  pbSrc,
//     _InOut_writes_bytes_( nDigits * SYMCRYPT_FDEF_DIGIT_SIZE )  PBYTE   pbDst,
//                                                                 UINT32  nDigits,
//                                                                 UINT32  mask )
//
// Constant-time conditional copy: pbDst = mask ? pbSrc : pbDst,
// implemented branch-free as dst = (src & mask) | (dst & ~mask).
//
// Initial inputs to registers:
//  pbSrc   -> r0
//  pbDst   -> r1
//  nDigits -> r2
//  mask    -> r3

// LEAF_ENTRY SymCryptFdefMaskedCopyAsm
FUNCTION_START(SymCryptFdefMaskedCopyAsm, 4, 10)
    // push {r4-r9, lr}                 // (saved by the FUNCTION_START prologue)

    neg     r2, r2              // negate the digit count (count up towards zero)
    mov     r9, #0              // r9 = 0

    subs    r4, r9, r3          // If (r3 > 0) clear the carry flag (i.e. borrow)
    sbc     r3, r9, r9          // r3 = mask = 0xffffffff if the carry flag is clear
    // orn r9, r9, r3           // r9 = NOT(mask) -- ORN is Thumb-2 only; emulated below
    orr     r9, r9, r3
    mvn     r9, r9              // r9 = ~mask

    mov     r8, r1              // save the destination pointer (r1 is advanced by the loads)

SymCryptFdefMaskedCopyAsmLoop:
    ldmia   r0!, {r4, r6}       // Load two words of the source
    ldmia   r1!, {r5, r7}       // Load two words of the destination
    and     r4, r4, r3          // src & mask
    and     r5, r5, r9          // dst & ~mask
    orr     r4, r4, r5
    and     r6, r6, r3
    and     r7, r7, r9
    orr     r6, r6, r7
    stmia   r8!, {r4, r6}       // Store the two words in the destination

    ldmia   r0!, {r4, r6}       // Load two words of the source
    ldmia   r1!, {r5, r7}       // Load two words of the destination
    and     r4, r4, r3
    and     r5, r5, r9
    orr     r4, r4, r5
    and     r6, r6, r3
    and     r7, r7, r9
    orr     r6, r6, r7
    stmia   r8!, {r4, r6}       // Store the two words in the destination

    adds    r2, r2, #1          // Increment the (negated) digit count by one
    bne     SymCryptFdefMaskedCopyAsmLoop

    // Done, no return value

    // pop {r4-r9, pc}          // (epilogue emitted by FUNCTION_END)

// LEAF_END SymCryptFdefMaskedCopyAsm
FUNCTION_END(SymCryptFdefMaskedCopyAsm)
|
||||
|
||||
//VOID
//SYMCRYPT_CALL
//SymCryptFdefRawMul(
//     _In_reads_(nWords1)             PCUINT32    pSrc1,
//                                     UINT32      nDigits1,
//     _In_reads_(nWords2)             PCUINT32    pSrc2,
//                                     UINT32      nDigits2,
//     _Out_writes_(nWords1 + nWords2) PUINT32     pDst )
//
// Initial inputs to registers:
//  pSrc1    -> r0
//  nDigits1 -> r1
//  pSrc2    -> r2
//  nDigits2 -> r3
//  pDst     -> In the stack
//
// Basic structure:
//  for each 2 words in Src1:
//      Dst += Src2 * (2 words of Src1)
//
// Register assignments
//  r0 = pSrc1 (moving forward one word every outer loop)
//  r1 = negated word count of pSrc1
//  r2 = pSrc2 (moving forward one *digit* every inner loop)
//  r3 = negated digit count of pSrc2 and pDst
//  r4 = pDst (moving forward one *digit* every inner loop)
//  r5 = Stored pDst (moving forward one word every outer loop)
//  r6, r7 = Current words loaded from pSrc1
//  r8, r9 = Current words loaded from pSrc2
//  <r12:r11:r10> = "96-bit" sliding register to hold the result of multiplies
//
// Stack assignments
#define pSrc2       0   // Stored pSrc2 in stack
#define nDigits2    4   // Stored negated digit count of pSrc2 in stack


// LEAF_ENTRY SymCryptFdefRawMulAsm
FUNCTION_START(SymCryptFdefRawMulAsm, 4, 13)
    // push {r4-r12, lr}                // (saved by the FUNCTION_START prologue)
    sub     sp, sp, #8          // 2 stack slots: pSrc2, nDigits2

    lsl     r1, r1, #2          // Calculate word count (4 words per digit)

    ldr     r4, [sp, #(8+4*10)] // load pDst (5th argument, past locals + 10 saved registers)

    neg     r1, r1              // negate nWords1
    neg     r3, r3              // negate nDigits2

    mov     r5, r4              // store pDst
    str     r2, [sp, #pSrc2]    // store pSrc2
    str     r3, [sp, #nDigits2] // store -nDigits2 for later

    //
    // First iteration of main loop (no adding of previous values from pDst)
    //
    mov     r11, #0             // Setting r11 = 0
    mov     r12, #0             // and r12 = 0
    ldmia   r0!, {r6, r7}       // Load two words from pSrc1

SymCryptFdefRawMulAsmLoopInner1:

    adds    r3, r3, #1          // move one digit up (sets flags; umaal/ldmia below leave them intact for the bne)

    ldmia   r2!, {r8, r9}       // Load two words from pSrc2

    mov     r10, #0             // Setting r10 = 0
    umaal   r10, r11, r6, r8    // <r11:r10> = r6 * r8 + r10 + r11
    str     r10, [r4], #4       // Store to destination
    umaal   r11, r12, r7, r8    // <r12:r11> = r7 * r8 + r11 + r12

    mov     r10, #0             // Setting r10 = 0
    umaal   r10, r11, r6, r9    // <r11:r10> = r6 * r9 + r10 + r11
    str     r10, [r4], #4       // Store to destination
    umaal   r11, r12, r7, r9    // <r12:r11> = r7 * r9 + r11 + r12

    ldmia   r2!, {r8, r9}       // Load two words from pSrc2

    mov     r10, #0             // Setting r10 = 0
    umaal   r10, r11, r6, r8    // <r11:r10> = r6 * r8 + r10 + r11
    str     r10, [r4], #4       // Store to destination
    umaal   r11, r12, r7, r8    // <r12:r11> = r7 * r8 + r11 + r12

    mov     r10, #0             // Setting r10 = 0
    umaal   r10, r11, r6, r9    // <r11:r10> = r6 * r9 + r10 + r11
    str     r10, [r4], #4       // Store to destination
    umaal   r11, r12, r7, r9    // <r12:r11> = r7 * r9 + r11 + r12

    bne     SymCryptFdefRawMulAsmLoopInner1     // branches on the 'adds r3' flags above

    stmia   r4, {r11, r12}      // Store the top two words in the destination

    add     r1, r1, #2          // move two words up
    add     r5, r5, #8          // move start of pDst two words up

    //
    // MAIN LOOP
    //
SymCryptFdefRawMulAsmLoopOuter:
    ldr     r3, [sp, #nDigits2] // set -nDigits2
    ldr     r2, [sp, #pSrc2]    // set pSrc2
    mov     r4, r5              // set pDst

    mov     r11, #0             // Setting r11 = 0
    mov     r12, #0             // and r12 = 0
    ldmia   r0!, {r6, r7}       // Load two words from pSrc1

SymCryptFdefRawMulAsmLoopInner:

    adds    r3, r3, #1          // move one digit up (sets flags for the bne below)

    ldmia   r2!, {r8, r9}       // Load two words from pSrc2

    ldr     r10, [r4]           // load 1 word from pDst (accumulate into existing partial product)
    umaal   r10, r11, r6, r8    // <r11:r10> = r6 * r8 + r10 + r11
    str     r10, [r4], #4       // Store to destination
    umaal   r11, r12, r7, r8    // <r12:r11> = r7 * r8 + r11 + r12

    ldr     r10, [r4]           // load 1 word from pDst
    umaal   r10, r11, r6, r9    // <r11:r10> = r6 * r9 + r10 + r11
    str     r10, [r4], #4       // Store to destination
    umaal   r11, r12, r7, r9    // <r12:r11> = r7 * r9 + r11 + r12

    ldmia   r2!, {r8, r9}       // Load two words from pSrc2

    ldr     r10, [r4]           // load 1 word from pDst
    umaal   r10, r11, r6, r8    // <r11:r10> = r6 * r8 + r10 + r11
    str     r10, [r4], #4       // Store to destination
    umaal   r11, r12, r7, r8    // <r12:r11> = r7 * r8 + r11 + r12

    ldr     r10, [r4]           // load 1 word from pDst
    umaal   r10, r11, r6, r9    // <r11:r10> = r6 * r9 + r10 + r11
    str     r10, [r4], #4       // Store to destination
    umaal   r11, r12, r7, r9    // <r12:r11> = r7 * r9 + r11 + r12

    bne     SymCryptFdefRawMulAsmLoopInner

    adds    r1, r1, #2          // move two words up (sets flags for the outer bne)
    add     r5, r5, #8          // move start of pDst two words up

    stmia   r4, {r11, r12}      // Store the top two words in the destination

    bne     SymCryptFdefRawMulAsmLoopOuter      // branches on the 'adds r1' flags above

    // Done, no return value

    add     sp, sp, #8
    // pop {r4-r12, pc}         // (epilogue emitted by FUNCTION_END)

// LEAF_END SymCryptFdefRawMulAsm
FUNCTION_END(SymCryptFdefRawMulAsm)
|
||||
|
||||
// Macro for the first loop of the first pass of RawSquareAsm.
// It takes one word from the source, multiplies it with the mulword,
// adds the high level word of the previous macro call, and stores it into
// the destination.
//
// No word is taken from the destination; thus r10 is always set to 0.
//
// No carry flag is propagated from the previous macro call as the maximum is
//      (2^32-1)^2 + 2^32-1 = 2^64 - 2^32
//
// Inputs (by convention of the surrounding code):
//  r2  = current pSrc window, r4 = current pDst window,
//  r6  = mulword, r11 = high word carried from the previous call.
MACRO_START(SQR_SINGLEADD_32, index)

    mov     r10, #0
    ldr     r8, [r2, #4*index]      // pSrc[i+j]

    umaal   r10, r11, r6, r8        // <r11:r10> = r6 * r8 + r10 + r11

    str     r10, [r4, #4*index]     // Store to destination
MACRO_END()
|
||||
|
||||
// Macro for the remaining loops of the first pass of RawSquareAsm.
// The only difference to SQR_SINGLEADD_32 is that it also adds the word loaded
// from the destination buffer (accumulating into the existing partial result).
//
// No carry flag is propagated from the previous macro call as the maximum is
//      (2^32-1)^2 + 2(2^32-1) = 2^64 - 1
MACRO_START(SQR_DOUBLEADD_32, index)

    ldr     r8, [r2, #4*index]      // pSrc[i+j]
    ldr     r10, [r4, #4*index]     // pDst[2*(i+j)]

    umaal   r10, r11, r6, r8        // <r11:r10> = r6 * r8 + r10 + r11

    str     r10, [r4, #4*index]     // Store to destination

MACRO_END()
|
||||
|
||||
// Macro for the third pass loop of RawSquareAsm.
// It takes one mulword from the source, squares it, and
// adds it to the even columns of the destination. The carries are propagated
// to the odd columns.
//
// Here we can have a (1-bit) carry to the next call because the maximum value for
// a pair of columns is (2^32-1)^2+(2^64-1)+1 = 2^65 - 2^33 + 1 < 2^65 - 1
//
// Note: the incoming carry flag is consumed by the first adcs, and the flag
// state left by the second adcs is the carry into the next macro invocation.
MACRO_START(SQR_DIAGONAL_PROP, index)
    ldr     r6, [r0, #4*index]      // mulword

    umull   r10, r11, r6, r6        // <r11:r10> = r6^2

    ldr     r8, [r4, #8*index]      // Load even column
    ldr     r9, [r4, #8*index + 4]  // Load odd column

    // Adding the square to the even column
    adcs    r10, r10, r8            // consumes the carry from the previous call and updates the flags

    // Propagating the sum to the next column
    adcs    r11, r11, r9            // This can generate a carry (for the next call)

    str     r10, [r4, #8*index]     // Store even column
    str     r11, [r4, #8*index + 4] // Store odd column
MACRO_END()
|
||||
|
||||
// VOID
// SYMCRYPT_CALL
// SymCryptFdefRawSquareAsm(
//     _In_reads_(nDgigits*SYMCRYPT_FDEF_DIGIT_NUINT32)    PCUINT32    pSrc,
//                                                         UINT32      nDigits,
//     _Out_writes_(2*nWords)                              PUINT32     pDst )
//
// Three-pass squaring:
//  pass 1: accumulate all cross products pSrc[i]*pSrc[j] for i<j into pDst
//  pass 2: double the accumulated result (shift left by 1 bit)
//  pass 3: add the diagonal squares pSrc[i]^2 and propagate carries
//
// Initial inputs to registers:
//  pSrc    -> r0
//  nDigits -> r1
//  pDst    -> r2
//
// Register assignments
//  r0 = pSrc
//  r1 = negated word count of pSrc
//  r2 = pSrc (moving forward one digit / 4 words every inner loop)
//  r3 = negated digit count of pSrc
//  r4 = pDst (moving forward one digit every inner loop)
//  r5 = pDst (moving forward one word every outer loop)
//  r6 = mulword from pSrc
//  r7 = Stored negated digit count of pSrc
//  r8 = Current words loaded from pSrc
//  r9 = Cyclic counter for the jumps
//  r10, r11 = "64-bit" sliding register to hold the result of multiplies,
//             r10 also receives a word from pDst
//  r12 = Negated digit counter of pSrc (updated every 4 iterations of main loop)
//
// Stack assignments
#define pDstSq  0   // Stored pDst in stack
#define pSrc    4   // Stored pSrc in stack


// LEAF_ENTRY SymCryptFdefRawSquareAsm
FUNCTION_START(SymCryptFdefRawSquareAsm, 3, 13)
    // push {r4-r12, lr}                // (saved by the FUNCTION_START prologue)
    sub     sp, sp, #8          // 2 stack slots: pDstSq, pSrc


    mov     r3, r1              // digit count into r3

    lsl     r1, r1, #2          // Calculate word count (4 words per digit)

    neg     r1, r1              // negate nWords
    neg     r3, r3              // negate nDigitsSq

    mov     r4, r2              // pDst
    mov     r5, r2              // store pDst

    str     r0, [sp, #pSrc]     // store pSrc
    str     r5, [sp, #pDstSq]   // store pDst
    mov     r7, r3              // store -nDigits for later
    mov     r12, r3             // Negated digit counter of pSrc

    mov     r2, r0              // inner loop pSrc

    //
    // First iteration of main loop (no adding of previous values from pDst)
    //
    ands    r11, r11, #0        // Clearing the carry flag and setting r11 = 0
    ldr     r6, [r0]            // load the first word (mulword) from pSrc1
    str     r11, [r4]           // store 0 for the first word (no cross product lands there)

    b       SymCryptFdefRawSquareAsmInnerLoopInit_Word1     // word 0 of the first digit is the mulword itself: skip it

SymCryptFdefRawSquareAsmInnerLoopInit_Word0:
    SQR_SINGLEADD_32 0

SymCryptFdefRawSquareAsmInnerLoopInit_Word1:
    SQR_SINGLEADD_32 1

    SQR_SINGLEADD_32 2

    SQR_SINGLEADD_32 3


    add     r2, r2, #16         // next digit of pSrc
    add     r4, r4, #16         // next digit of pDst

    adds    r3, r3, #1          // move one digit up
    bne     SymCryptFdefRawSquareAsmInnerLoopInit_Word0

    str     r11, [r4]           // Store the next (high) word into the destination
    add     r1, r1, #2          // move two words up (so we stop when real word count is "-1")
    mov     r9, #1              // Cyclic counter (which word within the current digit is the mulword)

    //
    // MAIN LOOP
    //
SymCryptFdefRawSquareAsmOuterLoop:

    add     r5, r5, #4          // move start of pDst one word up

    mov     r3, r12             // set -nDigits
    mov     r2, r0              // set pSrc
    mov     r4, r5              // set pDst

    ands    r11, r11, #0        // Clearing the carry flag and setting r11 = 0
    ldr     r6, [r0, r9, LSL #2]        // load the next mulword from pSrc

    // Cyclic counter and jump logic: enter the unrolled loop just past the
    // mulword's own position so only products with later words are formed.
    add     r9, r9, #1
    cmp     r9, #1
    beq     SymCryptFdefRawSquareAsmInnerLoop_Word1
    cmp     r9, #2
    beq     SymCryptFdefRawSquareAsmInnerLoop_Word2
    cmp     r9, #3
    beq     SymCryptFdefRawSquareAsmInnerLoop_Word3

    // The following instructions are only executed when r9 == 4
    mov     r9, #0              // Set it to 0

    add     r0, r0, #16         // move start of pSrc 4 words up
    add     r5, r5, #16         // move pDst 4 words up

    mov     r2, r0              // set pSrc
    mov     r4, r5              // set pDst

    adds    r3, r3, #1          // add 1 digit (one fewer digit of cross products remains)
    mov     r12, r3             // set the new digit counter

SymCryptFdefRawSquareAsmInnerLoop_Word0:
    SQR_DOUBLEADD_32 0

SymCryptFdefRawSquareAsmInnerLoop_Word1:
    SQR_DOUBLEADD_32 1

SymCryptFdefRawSquareAsmInnerLoop_Word2:
    SQR_DOUBLEADD_32 2

SymCryptFdefRawSquareAsmInnerLoop_Word3:
    SQR_DOUBLEADD_32 3


    add     r2, r2, #16         // next digit of pSrc
    add     r4, r4, #16         // next digit of pDst
    adds    r3, r3, #1          // move one digit up
    bne     SymCryptFdefRawSquareAsmInnerLoop_Word0

    str     r11, [r4]           // Store the next (high) word into the destination

    adds    r1, r1, #1          // move one word up
    bne     SymCryptFdefRawSquareAsmOuterLoop

    eor     r11, r11, r11       // Setting r11 = 0
    str     r11, [r5, #20]      // Store 0 to destination for the top word

    // //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    // Second Pass - Shifting all results 1 bit left
    // Third Pass - Adding the squares on the even columns and propagating the sum
    // //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

    mov     r3, r7              // -nDigits
    lsl     r3, r3, #1          // Double digits (destination has 2*nDigits digits)
    ldr     r4, [sp, #pDstSq]   // pDst pointer
    ands    r1, r1, #0          // Clear the flags (r1 = 0)
    ands    r2, r2, #0          // Clear the flags (r2 = 0, carry word between iterations)

SymCryptFdefRawSquareAsmSecondPass:
    rrxs    r2, r2              // set the carry flag if bit[0] of r2 is set

    ldmia   r4, {r8, r9}
    adcs    r8, r8, r8          // Shift left and add the carry
    adcs    r9, r9, r9
    stmia   r4!, {r8, r9}

    ldmia   r4, {r10, r11}
    adcs    r10, r10, r10       // Shift left and add the carry
    adcs    r11, r11, r11
    stmia   r4!, {r10, r11}

    adc     r2, r1, r1          // materialize the carry into r2 for the next iteration

    adds    r3, r3, #1          // move one digit up
    bne     SymCryptFdefRawSquareAsmSecondPass



    ldr     r0, [sp, #pSrc]     // src pointer
    ldr     r4, [sp, #pDstSq]   // pDst pointer
    // mov r3, r7               // Use r7 directly as the digit counter instead
    // ands r1, r1, #0          // (r1 is already 0 from the second pass)
    ands    r2, r2, #0          // Clear the flags (r2 = carry word between iterations)

SymCryptFdefRawSquareAsmThirdPass:
    rrxs    r2, r2              // set the carry flag if bit[0] of r2 is set

    SQR_DIAGONAL_PROP 0
    SQR_DIAGONAL_PROP 1
    SQR_DIAGONAL_PROP 2
    SQR_DIAGONAL_PROP 3

    adc     r2, r1, r1          // materialize the carry into r2 for the next iteration

    add     r0, r0, #16         // One digit up (not updated in SQR_DIAGONAL_PROP)
    add     r4, r4, #32         // Two digits up (not updated in SQR_DIAGONAL_PROP)

    adds    r7, r7, #1          // move one digit up
    bne     SymCryptFdefRawSquareAsmThirdPass

    // Done, no return value

    add     sp, sp, #8
    // pop {r4-r12, pc}         // (epilogue emitted by FUNCTION_END)

// LEAF_END SymCryptFdefRawSquareAsm
FUNCTION_END(SymCryptFdefRawSquareAsm)
|
||||
|
||||
//VOID
|
||||
//SymCryptFdefMontgomeryReduceAsm(
|
||||
// _In_ PCSYMCRYPT_MODULUS pmMod,
|
||||
// _Inout_ PUINT32 pSrc,
|
||||
// _Out_ PUINT32 pDst )
|
||||
//
|
||||
// Initial inputs to registers:
|
||||
// pmMod -> r0
|
||||
// pSrc -> r1
|
||||
// pDst -> r2
|
||||
//
|
||||
// Register assignments
|
||||
// r0 = pMod (moving forward one *digit* every inner loop)
|
||||
// r1 = pSrc (moving forward one *digit* every inner loop)
|
||||
// r2 = Stored pSrc (moving forward one word every outer loop)
|
||||
// r3 = negated digit count of pSrc and pMod
|
||||
// r4 = negated word count of pSrc
|
||||
// r5, r6 = m = pSrc[i]*Inv64
|
||||
// r7 = hc = high carry variable
|
||||
// r8, r9 = Current words loaded from pMod
|
||||
// <r12:r11:r10> = "96-bit" sliding register to hold the result of multiplies
|
||||
|
||||
// Stack assignments
|
||||
#define pMod 0 // Stored pMod
|
||||
#define pDst 4 // Stored pDst
|
||||
#define nDigits 8 // Stored negated digit count of pSrc
|
||||
#define inv64 12 // Inv64 of modulus
|
||||
|
||||
// LEAF_ENTRY SymCryptFdefMontgomeryReduceAsm
|
||||
FUNCTION_START(SymCryptFdefMontgomeryReduceAsm, 3, 13)
|
||||
// push {r4-r12, lr}
|
||||
sub sp, sp, #16
|
||||
|
||||
str r2, [sp, #pDst] // Store pDst in the stack
|
||||
ldr r3, [r0, #SymCryptModulusNdigitsOffsetArm] // # of Digits
|
||||
ldr r5, [r0, #SymCryptModulusInv64OffsetArm] // Inv64 of modulus
|
||||
add r0, r0, #SymCryptModulusValueOffsetArm // pMod
|
||||
str r5, [sp, #inv64] // Store inv64 in the stack
|
||||
|
||||
lsl r4, r3, #2 // Multiply by 4 to get the number of words
|
||||
|
||||
neg r3, r3 // Negate the digit count
|
||||
neg r4, r4 // Negate the word count
|
||||
|
||||
str r0, [sp, #pMod] // Store the pMod pointer
|
||||
mov r2, r1 // Store the pSrc pointer
|
||||
str r3, [sp, #nDigits] // Store the digit count for later
|
||||
|
||||
eor r7, r7, r7 // Set hc to 0
|
||||
|
||||
//
|
||||
// Main loop
|
||||
//
|
||||
SymCryptFdefMontgomeryReduceAsmOuter:
|
||||
ldr r3, [sp, #inv64] // Inv64 of modulus
|
||||
|
||||
ldmia r1, {r10, r12} // Load two words from pSrc
|
||||
ldmia r0, {r8,r9} // Load two words from pMod
|
||||
mov r11, #0
|
||||
mul r5, r10, r3 // <31:0> bits of pSrc[i]*Inv64 = m1 (first multiplier)
|
||||
umaal r10, r11, r5, r8 // r11 <-- High( m1*pMod[0] + pSrc[i] )
|
||||
umaal r12, r11, r5, r9 // Calculate pSrc[i+1] = Low( m1*pMod[1] + pSrc[i+1] + High( m1*pMod[0] + pSrc[i] ))
|
||||
mul r6, r12, r3 // <31:0> bits of pSrc[i+1]*Inv64 = m2
|
||||
|
||||
ldr r3, [sp, #nDigits] // Reset the digit counter
|
||||
mov r11, #0 // Set c to 0
|
||||
mov r12, #0 // Set c to 0
|
||||
|
||||
SymCryptFdefMontgomeryReduceAsmInner:
|
||||
adds r3, r3, #1 // Move one digit up (none of the commands updates the carry)
|
||||
|
||||
ldmia r0!, {r8, r9} // Load two words from pMod[]
|
||||
|
||||
ldr r10, [r1] // pSrc[j]
|
||||
umaal r10, r11, r5, r8 // c = <r11:r10> = m1 * pMod[j] + pSrc[j] + c
|
||||
str r10, [r1], #4 // pSrc[j] = (UINT32) c
|
||||
umaal r11, r12, r6, r8 // c = <r12:r11> = m2 * pMod[j] + c
|
||||
|
||||
ldr r10, [r1] // pSrc[j]
|
||||
umaal r10, r11, r5, r9 // c = <r11:r10> = m1 * pMod[j] + pSrc[j] + c
|
||||
str r10, [r1], #4 // pSrc[j] = (UINT32) c
|
||||
umaal r11, r12, r6, r9 // c = <r12:r11> = m2 * pMod[j] + c
|
||||
|
||||
ldmia r0!, {r8, r9} // Load two words from pMod[]
|
||||
|
||||
ldr r10, [r1] // pSrc[j]
|
||||
umaal r10, r11, r5, r8 // c = <r11:r10> = m1 * pMod[j] + pSrc[j] + c
|
||||
str r10, [r1], #4 // pSrc[j] = (UINT32) c
|
||||
umaal r11, r12, r6, r8 // c = <r12:r11> = m2 * pMod[j] + c
|
||||
|
||||
ldr r10, [r1] // pSrc[j]
|
||||
umaal r10, r11, r5, r9 // c = <r11:r10> = m1 * pMod[j] + pSrc[j] + c
|
||||
str r10, [r1], #4 // pSrc[j] = (UINT32) c
|
||||
umaal r11, r12, r6, r9 // c = <r12:r11> = m2 * pMod[j] + c
|
||||
|
||||
bne SymCryptFdefMontgomeryReduceAsmInner
|
||||
|
||||
mov r8, #0 // r8 = 0
|
||||
mov r9, #0 // r9 = 0
|
||||
|
||||
ldmia r1, {r5, r6} // Load pSrc[nWords] and pSrc[nWords+1]
|
||||
|
||||
adds r11, r11, r5 // c + pSrc[nWords]
|
||||
adc r8, r8, #0 // Add the carry if any
|
||||
adds r11, r11, r7 // c + pSrc[nWords] + hc
|
||||
adc r8, r8, #0 // Add the carry if any
|
||||
str r11, [r1], #4 // pSrc[nWords] = c
|
||||
|
||||
adds r12, r12, r6 // c + pSrc[nWords+1]
|
||||
adc r9, r9, #0 // Add the carry if any
|
||||
adds r12, r12, r8 // c + pSrc[nWords] + hc
|
||||
adc r7, r9, #0 // Add the carry if any
|
||||
str r12, [r1] // pSrc[nWords+1] = c
|
||||
|
||||
adds r4, r4, #2 // Move two words up
|
||||
|
||||
add r2, r2, #8 // Move stored pSrc pointer two words up
|
||||
ldr r0, [sp, #pMod] // Restore the pMod pointer
|
||||
mov r1, r2 // Restore the pSrc pointer
|
||||
|
||||
bne SymCryptFdefMontgomeryReduceAsmOuter
|
||||
|
||||
//
|
||||
// Subtraction
|
||||
//
|
||||
|
||||
// Prepare the pointers for subtract
|
||||
mov r0, r2 // pSrc
|
||||
mov r11, r2 // Store pSrc for later
|
||||
ldr r1, [sp, #pMod] // pMod
|
||||
ldr r2, [sp, #pDst] // pDst
|
||||
ldr r3, [sp, #nDigits] // Reset the digit counter
|
||||
|
||||
mov r10, r7 // r10 = hc
|
||||
|
||||
mov r8, #0 // borrow = r8 = 0
|
||||
mov r9, #0 // r9 = 0
|
||||
|
||||
SymCryptFdefMontgomeryReduceRawSubAsmLoop:
|
||||
subs r8, r9, r8 // if r8>0 then the "borrow flag" is set
|
||||
|
||||
ldmia r0!, {r4, r6} // Load two words of pSrc1
|
||||
ldmia r1!, {r5, r7} // Load two words of pSrc2
|
||||
sbcs r4, r4, r5
|
||||
sbcs r6, r6, r7
|
||||
stmia r2!, {r4, r6} // Store the result in the destination
|
||||
|
||||
ldmia r0!, {r4, r6} // Load two words of pSrc1
|
||||
ldmia r1!, {r5, r7} // Load two words of pSrc2
|
||||
sbcs r4, r4, r5
|
||||
sbcs r6, r6, r7
|
||||
stmia r2!, {r4, r6} // Store the result in the destination
|
||||
|
||||
sbc r8, r9, r9 // If borrow=1, then r8 = -1 = 0xffffffff
|
||||
|
||||
adds r3, r3, #1 // Increment the digit count by one
|
||||
bne SymCryptFdefMontgomeryReduceRawSubAsmLoop
|
||||
|
||||
// Prepare the pointers for masked copy
|
||||
mov r0, r11 // pSrc
|
||||
ldr r1, [sp, #pDst] // pDst
|
||||
|
||||
and r9, r8, #1 // If r8>0, set the return value to 1
|
||||
orr r11, r10, r9 // r11 = hc|d
|
||||
|
||||
ldr r2, [sp, #nDigits] // Restore the digit counter
|
||||
|
||||
mov r9, #0 // r9 = 0
|
||||
|
||||
subs r4, r10, r11 // If (r11 > r10) clear the carry flag (i.e. borrow)
|
||||
sbc r3, r9, r9 // r3 = mask = 0xffffffff if the carry flag is clear
|
||||
// orn r9, r9, r3 // r9 = NOT(MASK) = 0 if r3 = 0xffffffff
|
||||
orr r9, r9, r3
|
||||
mvn r9, r9
|
||||
mov r8, r1 // save the destination pointer
|
||||
|
||||
SymCryptFdefMontgomeryReduceMaskedCopyAsmLoop:
|
||||
ldmia r0!, {r4, r6} // Load two words of the source
|
||||
ldmia r1!, {r5, r7} // Load two words of the destination
|
||||
and r4, r4, r3
|
||||
and r5, r5, r9
|
||||
orr r4, r4, r5
|
||||
and r6, r6, r3
|
||||
and r7, r7, r9
|
||||
orr r6, r6, r7
|
||||
stmia r8!, {r4, r6} // Store the two words in the destination
|
||||
|
||||
ldmia r0!, {r4, r6} // Load two words of the source
|
||||
ldmia r1!, {r5, r7} // Load two words of the destination
|
||||
and r4, r4, r3
|
||||
and r5, r5, r9
|
||||
orr r4, r4, r5
|
||||
and r6, r6, r3
|
||||
and r7, r7, r9
|
||||
orr r6, r6, r7
|
||||
stmia r8!, {r4, r6} // Store the two words in the destination
|
||||
|
||||
adds r2, r2, #1 // Increment the digit count by one
|
||||
bne SymCryptFdefMontgomeryReduceMaskedCopyAsmLoop
|
||||
|
||||
// Done, no return value
|
||||
|
||||
add sp, sp, #16
|
||||
// pop {r4-r12, pc}
|
||||
|
||||
// LEAF_END SymCryptFdefMontgomeryReduceAsm
|
||||
FUNCTION_END(SymCryptFdefMontgomeryReduceAsm)
|
|
@ -0,0 +1,20 @@
|
|||
// SymCryptWipe
|
||||
//
|
||||
// Copyright (c) Microsoft Corporation. Licensed under the MIT license.
|
||||
//
|
||||
// Secure wipe
|
||||
//
|
||||
|
||||
|
||||
// VOID
|
||||
// SYMCRYPT_CALL
|
||||
// SymCryptWipe( _Out_writes_bytes_( cbData ) PVOID pbData,
|
||||
// SIZE_T cbData )
|
||||
|
||||
FUNCTION_START(SymCryptWipeAsm, 2, 0)
|
||||
// we just jump to memset.
|
||||
// this is enough to stop the compiler optimizing the memset away.
|
||||
mov r2, r1
|
||||
mov r1, #0
|
||||
bl memset
|
||||
FUNCTION_END(SymCryptWipeAsm)
|
169
lib/sha256Par.c
169
lib/sha256Par.c
|
@ -825,48 +825,46 @@ SymCryptParallelSha256AppendBlocks_neon(
|
|||
//
|
||||
// This can probably be done faster, but we are missing the VTRN.64 instruction
|
||||
// which makes it hard to do this efficient in intrinsics.
|
||||
// The vsetq_lane_u32 seems to be completely ignored by the compiler.
|
||||
//
|
||||
ha[7].n128_u32[0] = pChain[0]->H[0];
|
||||
ha[7].n128_u32[1] = pChain[1]->H[0];
|
||||
ha[7].n128_u32[2] = pChain[2]->H[0];
|
||||
ha[7].n128_u32[3] = pChain[3]->H[0];
|
||||
|
||||
ha[6].n128_u32[0] = pChain[0]->H[1];
|
||||
ha[6].n128_u32[1] = pChain[1]->H[1];
|
||||
ha[6].n128_u32[2] = pChain[2]->H[1];
|
||||
ha[6].n128_u32[3] = pChain[3]->H[1];
|
||||
|
||||
ha[5].n128_u32[0] = pChain[0]->H[2];
|
||||
ha[5].n128_u32[1] = pChain[1]->H[2];
|
||||
ha[5].n128_u32[2] = pChain[2]->H[2];
|
||||
ha[5].n128_u32[3] = pChain[3]->H[2];
|
||||
|
||||
ha[4].n128_u32[0] = pChain[0]->H[3];
|
||||
ha[4].n128_u32[1] = pChain[1]->H[3];
|
||||
ha[4].n128_u32[2] = pChain[2]->H[3];
|
||||
ha[4].n128_u32[3] = pChain[3]->H[3];
|
||||
|
||||
ha[3].n128_u32[0] = pChain[0]->H[4];
|
||||
ha[3].n128_u32[1] = pChain[1]->H[4];
|
||||
ha[3].n128_u32[2] = pChain[2]->H[4];
|
||||
ha[3].n128_u32[3] = pChain[3]->H[4];
|
||||
|
||||
ha[2].n128_u32[0] = pChain[0]->H[5];
|
||||
ha[2].n128_u32[1] = pChain[1]->H[5];
|
||||
ha[2].n128_u32[2] = pChain[2]->H[5];
|
||||
ha[2].n128_u32[3] = pChain[3]->H[5];
|
||||
|
||||
ha[1].n128_u32[0] = pChain[0]->H[6];
|
||||
ha[1].n128_u32[1] = pChain[1]->H[6];
|
||||
ha[1].n128_u32[2] = pChain[2]->H[6];
|
||||
ha[1].n128_u32[3] = pChain[3]->H[6];
|
||||
|
||||
ha[0].n128_u32[0] = pChain[0]->H[7];
|
||||
ha[0].n128_u32[1] = pChain[1]->H[7];
|
||||
ha[0].n128_u32[2] = pChain[2]->H[7];
|
||||
ha[0].n128_u32[3] = pChain[3]->H[7];
|
||||
|
||||
ha[7] = vsetq_lane_u32( pChain[0]->H[0], ha[7], 0 );
|
||||
ha[7] = vsetq_lane_u32( pChain[1]->H[0], ha[7], 1 );
|
||||
ha[7] = vsetq_lane_u32( pChain[2]->H[0], ha[7], 2 );
|
||||
ha[7] = vsetq_lane_u32( pChain[3]->H[0], ha[7], 3 );
|
||||
|
||||
ha[6] = vsetq_lane_u32( pChain[0]->H[1], ha[6], 0 );
|
||||
ha[6] = vsetq_lane_u32( pChain[1]->H[1], ha[6], 1 );
|
||||
ha[6] = vsetq_lane_u32( pChain[2]->H[1], ha[6], 2 );
|
||||
ha[6] = vsetq_lane_u32( pChain[3]->H[1], ha[6], 3 );
|
||||
|
||||
ha[5] = vsetq_lane_u32( pChain[0]->H[2], ha[5], 0 );
|
||||
ha[5] = vsetq_lane_u32( pChain[1]->H[2], ha[5], 1 );
|
||||
ha[5] = vsetq_lane_u32( pChain[2]->H[2], ha[5], 2 );
|
||||
ha[5] = vsetq_lane_u32( pChain[3]->H[2], ha[5], 3 );
|
||||
|
||||
ha[4] = vsetq_lane_u32( pChain[0]->H[3], ha[4], 0 );
|
||||
ha[4] = vsetq_lane_u32( pChain[1]->H[3], ha[4], 1 );
|
||||
ha[4] = vsetq_lane_u32( pChain[2]->H[3], ha[4], 2 );
|
||||
ha[4] = vsetq_lane_u32( pChain[3]->H[3], ha[4], 3 );
|
||||
|
||||
ha[3] = vsetq_lane_u32( pChain[0]->H[4], ha[3], 0 );
|
||||
ha[3] = vsetq_lane_u32( pChain[1]->H[4], ha[3], 1 );
|
||||
ha[3] = vsetq_lane_u32( pChain[2]->H[4], ha[3], 2 );
|
||||
ha[3] = vsetq_lane_u32( pChain[3]->H[4], ha[3], 3 );
|
||||
|
||||
ha[2] = vsetq_lane_u32( pChain[0]->H[5], ha[2], 0 );
|
||||
ha[2] = vsetq_lane_u32( pChain[1]->H[5], ha[2], 1 );
|
||||
ha[2] = vsetq_lane_u32( pChain[2]->H[5], ha[2], 2 );
|
||||
ha[2] = vsetq_lane_u32( pChain[3]->H[5], ha[2], 3 );
|
||||
|
||||
ha[1] = vsetq_lane_u32( pChain[0]->H[6], ha[1], 0 );
|
||||
ha[1] = vsetq_lane_u32( pChain[1]->H[6], ha[1], 1 );
|
||||
ha[1] = vsetq_lane_u32( pChain[2]->H[6], ha[1], 2 );
|
||||
ha[1] = vsetq_lane_u32( pChain[3]->H[6], ha[1], 3 );
|
||||
|
||||
ha[0] = vsetq_lane_u32( pChain[0]->H[7], ha[0], 0 );
|
||||
ha[0] = vsetq_lane_u32( pChain[1]->H[7], ha[0], 1 );
|
||||
ha[0] = vsetq_lane_u32( pChain[2]->H[7], ha[0], 2 );
|
||||
ha[0] = vsetq_lane_u32( pChain[3]->H[7], ha[0], 3 );
|
||||
|
||||
buf[0] = ha[4];
|
||||
buf[1] = ha[5];
|
||||
|
@ -881,10 +879,10 @@ SymCryptParallelSha256AppendBlocks_neon(
|
|||
//
|
||||
for( r=0; r<16; r ++ )
|
||||
{
|
||||
T0.n128_u32[0] = SYMCRYPT_LOAD_MSBFIRST32( ppByte[0] ); ppByte[0] += 4;
|
||||
T0.n128_u32[1] = SYMCRYPT_LOAD_MSBFIRST32( ppByte[1] ); ppByte[1] += 4;
|
||||
T0.n128_u32[2] = SYMCRYPT_LOAD_MSBFIRST32( ppByte[2] ); ppByte[2] += 4;
|
||||
T0.n128_u32[3] = SYMCRYPT_LOAD_MSBFIRST32( ppByte[3] ); ppByte[3] += 4;
|
||||
T0 = vsetq_lane_u32( SYMCRYPT_LOAD_MSBFIRST32( ppByte[0] ), T0, 0 ); ppByte[0] += 4;
|
||||
T0 = vsetq_lane_u32( SYMCRYPT_LOAD_MSBFIRST32( ppByte[1] ), T0, 1 ); ppByte[1] += 4;
|
||||
T0 = vsetq_lane_u32( SYMCRYPT_LOAD_MSBFIRST32( ppByte[2] ), T0, 2 ); ppByte[2] += 4;
|
||||
T0 = vsetq_lane_u32( SYMCRYPT_LOAD_MSBFIRST32( ppByte[3] ), T0, 3 ); ppByte[3] += 4;
|
||||
W[r] = T0;
|
||||
}
|
||||
|
||||
|
@ -964,47 +962,46 @@ SymCryptParallelSha256AppendBlocks_neon(
|
|||
//
|
||||
// Copy the chaining state back into the hash structure
|
||||
//
|
||||
|
||||
pChain[0]->H[0] = ha[7].n128_u32[0];
|
||||
pChain[1]->H[0] = ha[7].n128_u32[1];
|
||||
pChain[2]->H[0] = ha[7].n128_u32[2];
|
||||
pChain[3]->H[0] = ha[7].n128_u32[3];
|
||||
|
||||
pChain[0]->H[1] = ha[6].n128_u32[0];
|
||||
pChain[1]->H[1] = ha[6].n128_u32[1];
|
||||
pChain[2]->H[1] = ha[6].n128_u32[2];
|
||||
pChain[3]->H[1] = ha[6].n128_u32[3];
|
||||
|
||||
pChain[0]->H[2] = ha[5].n128_u32[0];
|
||||
pChain[1]->H[2] = ha[5].n128_u32[1];
|
||||
pChain[2]->H[2] = ha[5].n128_u32[2];
|
||||
pChain[3]->H[2] = ha[5].n128_u32[3];
|
||||
|
||||
pChain[0]->H[3] = ha[4].n128_u32[0];
|
||||
pChain[1]->H[3] = ha[4].n128_u32[1];
|
||||
pChain[2]->H[3] = ha[4].n128_u32[2];
|
||||
pChain[3]->H[3] = ha[4].n128_u32[3];
|
||||
|
||||
pChain[0]->H[4] = ha[3].n128_u32[0];
|
||||
pChain[1]->H[4] = ha[3].n128_u32[1];
|
||||
pChain[2]->H[4] = ha[3].n128_u32[2];
|
||||
pChain[3]->H[4] = ha[3].n128_u32[3];
|
||||
|
||||
pChain[0]->H[5] = ha[2].n128_u32[0];
|
||||
pChain[1]->H[5] = ha[2].n128_u32[1];
|
||||
pChain[2]->H[5] = ha[2].n128_u32[2];
|
||||
pChain[3]->H[5] = ha[2].n128_u32[3];
|
||||
|
||||
pChain[0]->H[6] = ha[1].n128_u32[0];
|
||||
pChain[1]->H[6] = ha[1].n128_u32[1];
|
||||
pChain[2]->H[6] = ha[1].n128_u32[2];
|
||||
pChain[3]->H[6] = ha[1].n128_u32[3];
|
||||
|
||||
pChain[0]->H[7] = ha[0].n128_u32[0];
|
||||
pChain[1]->H[7] = ha[0].n128_u32[1];
|
||||
pChain[2]->H[7] = ha[0].n128_u32[2];
|
||||
pChain[3]->H[7] = ha[0].n128_u32[3];
|
||||
|
||||
pChain[0]->H[0] = vgetq_lane_u32( ha[7], 0 );
|
||||
pChain[1]->H[0] = vgetq_lane_u32( ha[7], 1 );
|
||||
pChain[2]->H[0] = vgetq_lane_u32( ha[7], 2 );
|
||||
pChain[3]->H[0] = vgetq_lane_u32( ha[7], 3 );
|
||||
|
||||
pChain[0]->H[1] = vgetq_lane_u32( ha[6], 0 );
|
||||
pChain[1]->H[1] = vgetq_lane_u32( ha[6], 1 );
|
||||
pChain[2]->H[1] = vgetq_lane_u32( ha[6], 2 );
|
||||
pChain[3]->H[1] = vgetq_lane_u32( ha[6], 3 );
|
||||
|
||||
pChain[0]->H[2] = vgetq_lane_u32( ha[5], 0 );
|
||||
pChain[1]->H[2] = vgetq_lane_u32( ha[5], 1 );
|
||||
pChain[2]->H[2] = vgetq_lane_u32( ha[5], 2 );
|
||||
pChain[3]->H[2] = vgetq_lane_u32( ha[5], 3 );
|
||||
|
||||
pChain[0]->H[3] = vgetq_lane_u32( ha[4], 0 );
|
||||
pChain[1]->H[3] = vgetq_lane_u32( ha[4], 1 );
|
||||
pChain[2]->H[3] = vgetq_lane_u32( ha[4], 2 );
|
||||
pChain[3]->H[3] = vgetq_lane_u32( ha[4], 3 );
|
||||
|
||||
pChain[0]->H[4] = vgetq_lane_u32( ha[3], 0 );
|
||||
pChain[1]->H[4] = vgetq_lane_u32( ha[3], 1 );
|
||||
pChain[2]->H[4] = vgetq_lane_u32( ha[3], 2 );
|
||||
pChain[3]->H[4] = vgetq_lane_u32( ha[3], 3 );
|
||||
|
||||
pChain[0]->H[5] = vgetq_lane_u32( ha[2], 0 );
|
||||
pChain[1]->H[5] = vgetq_lane_u32( ha[2], 1 );
|
||||
pChain[2]->H[5] = vgetq_lane_u32( ha[2], 2 );
|
||||
pChain[3]->H[5] = vgetq_lane_u32( ha[2], 3 );
|
||||
|
||||
pChain[0]->H[6] = vgetq_lane_u32( ha[1], 0 );
|
||||
pChain[1]->H[6] = vgetq_lane_u32( ha[1], 1 );
|
||||
pChain[2]->H[6] = vgetq_lane_u32( ha[1], 2 );
|
||||
pChain[3]->H[6] = vgetq_lane_u32( ha[1], 3 );
|
||||
|
||||
pChain[0]->H[7] = vgetq_lane_u32( ha[0], 0 );
|
||||
pChain[1]->H[7] = vgetq_lane_u32( ha[0], 1 );
|
||||
pChain[2]->H[7] = vgetq_lane_u32( ha[0], 2 );
|
||||
pChain[3]->H[7] = vgetq_lane_u32( ha[0], 3 );
|
||||
|
||||
SymCryptWipeKnownSize( buf, sizeof( buf ) );
|
||||
}
|
||||
|
||||
|
|
|
@ -37,7 +37,7 @@ include ksamd64.inc
|
|||
#define GET_SYMBOL_ADDRESS(__symbol) __symbol@plt+rip
|
||||
#define HEX(__constant) 0x##__constant
|
||||
#define TEXTAREA()
|
||||
#define EXTERN(__label)
|
||||
#define EXTERN(__label) .extern __label
|
||||
#define LABEL(__labelname) __labelname:
|
||||
|
||||
#else
|
||||
|
|
|
@ -19,7 +19,7 @@ set(KEEP_SYMBOL_ARGS
|
|||
)
|
||||
|
||||
# Determine the which executable to use for stripping binaries
|
||||
if(SYMCRYPT_TARGET_ARCH MATCHES ARM64 AND NOT CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "ARM64|aarch64")
|
||||
if(SYMCRYPT_TARGET_ARCH MATCHES ARM AND NOT CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "ARM64|aarch64")
|
||||
set(STRIP_COMMAND ${TARGET_TRIPLE}-strip)
|
||||
set(OBJCOPY_COMMAND ${TARGET_TRIPLE}-objcopy)
|
||||
else()
|
||||
|
|
|
@ -8,8 +8,152 @@
|
|||
|
||||
#include "precomp.h"
|
||||
|
||||
#if UINTPTR_MAX == 0xFFFFFFFF
|
||||
#define Elf_Shdr Elf32_Shdr
|
||||
#define Elf_Phdr Elf32_Phdr
|
||||
#define Elf_Sym Elf32_Sym
|
||||
#define Elf_Dyn Elf32_Dyn
|
||||
#define Elf_Ehdr Elf32_Ehdr
|
||||
#define Elf_Addr Elf32_Addr
|
||||
#define Elf_Off Elf32_Off
|
||||
#define Elf_Rel Elf32_Rel
|
||||
#define Elf_Rela Elf32_Rela
|
||||
#define ELF_R_TYPE(X) ELF32_R_TYPE(X)
|
||||
#define ELF_R_SYM(X) ELF32_R_SYM(X)
|
||||
#define Elf_Word Elf32_Word
|
||||
#define SYMCRYPT_FORCE_READ_ADDR SYMCRYPT_FORCE_READ32
|
||||
#elif UINTPTR_MAX == 0xFFFFFFFFFFFFFFFFu
|
||||
#define Elf_Shdr Elf64_Shdr
|
||||
#define Elf_Phdr Elf64_Phdr
|
||||
#define Elf_Sym Elf64_Sym
|
||||
#define Elf_Dyn Elf64_Dyn
|
||||
#define Elf_Ehdr Elf64_Ehdr
|
||||
#define Elf_Addr Elf64_Addr
|
||||
#define Elf_Off Elf64_Off
|
||||
#define Elf_Rel Elf64_Rel
|
||||
#define Elf_Rela Elf64_Rela
|
||||
#define ELF_R_TYPE(X) ELF64_R_TYPE(X)
|
||||
#define ELF_R_SYM(X) ELF64_R_SYM(X)
|
||||
#define Elf_Word Elf64_Word
|
||||
#define SYMCRYPT_FORCE_READ_ADDR SYMCRYPT_FORCE_READ64
|
||||
#else
|
||||
#error Unknown CPU pointer size
|
||||
#endif
|
||||
|
||||
#ifdef SYMCRYPT_DEBUG_INTEGRITY
|
||||
#include <stdio.h>
|
||||
|
||||
VOID
|
||||
DbgDumpHex(PCBYTE pbData, SIZE_T cbData)
|
||||
{
|
||||
ULONG i,count;
|
||||
CHAR digits[]="0123456789abcdef";
|
||||
CHAR pbLine[256];
|
||||
ULONG cbLine, cbHeader = 0;
|
||||
ULONG_PTR address;
|
||||
|
||||
if(pbData == NULL && cbData != 0)
|
||||
{
|
||||
// strcat_s(pbLine, RTL_NUMBER_OF(pbLine), "<null> buffer!!!\n");
|
||||
fprintf(stderr, "<null> buffer!!!\n");
|
||||
return;
|
||||
}
|
||||
|
||||
for(; cbData ; cbData -= count, pbData += count)
|
||||
{
|
||||
count = (cbData > 16) ? 16:cbData;
|
||||
|
||||
cbLine = cbHeader;
|
||||
|
||||
address = (ULONG_PTR)pbData;
|
||||
|
||||
#if UINTPTR_MAX == 0xFFFFFFFFFFFFFFFFu
|
||||
// 64 bit addresses.
|
||||
pbLine[cbLine++] = digits[(address >> 0x3c) & 0x0f];
|
||||
pbLine[cbLine++] = digits[(address >> 0x38) & 0x0f];
|
||||
pbLine[cbLine++] = digits[(address >> 0x34) & 0x0f];
|
||||
pbLine[cbLine++] = digits[(address >> 0x30) & 0x0f];
|
||||
pbLine[cbLine++] = digits[(address >> 0x2c) & 0x0f];
|
||||
pbLine[cbLine++] = digits[(address >> 0x28) & 0x0f];
|
||||
pbLine[cbLine++] = digits[(address >> 0x24) & 0x0f];
|
||||
pbLine[cbLine++] = digits[(address >> 0x20) & 0x0f];
|
||||
#endif
|
||||
pbLine[cbLine++] = digits[(address >> 0x1c) & 0x0f];
|
||||
pbLine[cbLine++] = digits[(address >> 0x18) & 0x0f];
|
||||
pbLine[cbLine++] = digits[(address >> 0x14) & 0x0f];
|
||||
pbLine[cbLine++] = digits[(address >> 0x10) & 0x0f];
|
||||
pbLine[cbLine++] = digits[(address >> 0x0c) & 0x0f];
|
||||
pbLine[cbLine++] = digits[(address >> 0x08) & 0x0f];
|
||||
pbLine[cbLine++] = digits[(address >> 0x04) & 0x0f];
|
||||
pbLine[cbLine++] = digits[(address ) & 0x0f];
|
||||
pbLine[cbLine++] = ' ';
|
||||
pbLine[cbLine++] = ' ';
|
||||
|
||||
for(i = 0; i < count; i++)
|
||||
{
|
||||
pbLine[cbLine++] = digits[pbData[i]>>4];
|
||||
pbLine[cbLine++] = digits[pbData[i]&0x0f];
|
||||
if(i == 7)
|
||||
{
|
||||
pbLine[cbLine++] = ':';
|
||||
}
|
||||
else
|
||||
{
|
||||
pbLine[cbLine++] = ' ';
|
||||
}
|
||||
}
|
||||
|
||||
for(; i < 16; i++)
|
||||
{
|
||||
pbLine[cbLine++] = ' ';
|
||||
pbLine[cbLine++] = ' ';
|
||||
pbLine[cbLine++] = ' ';
|
||||
}
|
||||
|
||||
pbLine[cbLine++] = ' ';
|
||||
|
||||
for(i = 0; i < count; i++)
|
||||
{
|
||||
if(pbData[i] < 32 || pbData[i] > 126)
|
||||
{
|
||||
pbLine[cbLine++] = '.';
|
||||
}
|
||||
else
|
||||
{
|
||||
pbLine[cbLine++] = pbData[i];
|
||||
}
|
||||
}
|
||||
|
||||
pbLine[cbLine++] = 0;
|
||||
|
||||
fprintf(stderr, "%s\n", pbLine);
|
||||
}
|
||||
}
|
||||
|
||||
VOID
|
||||
SYMCRYPT_CALL
|
||||
SymCryptHmacSha256AppendDbg(
|
||||
_In_ CHAR* pszLabel,
|
||||
_Inout_ PSYMCRYPT_HMAC_SHA256_STATE pState,
|
||||
_In_reads_( cbData ) PCBYTE pbData,
|
||||
SIZE_T cbData )
|
||||
{
|
||||
fprintf(stderr, "\nHMAC append: %s size %lx\n", pszLabel, cbData);
|
||||
DbgDumpHexString(pbData, (ULONG)cbData);
|
||||
SymCryptHmacSha256Append(pState, pbData, cbData);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
// These placeholder vaulues must match the values in process_fips_module.py
|
||||
#if UINTPTR_MAX == 0xFFFFFFFF
|
||||
#define PLACEHOLDER_VALUE 0x8BADF00D
|
||||
#elif UINTPTR_MAX == 0xFFFFFFFFFFFFFFFFu
|
||||
#define PLACEHOLDER_VALUE 0x4BADF00D8BADF00D
|
||||
#else
|
||||
#error Unknown CPU pointer size
|
||||
#endif
|
||||
|
||||
#define PLACEHOLDER_ARRAY \
|
||||
{\
|
||||
0x5B, 0x75, 0xBB, 0xE4, 0x9E, 0x18, 0x03, 0x55,\
|
||||
|
@ -29,11 +173,11 @@
|
|||
|
||||
// Relative virtual address of the HMAC key. Used to calculate where the module starts in memory
|
||||
// at runtime.
|
||||
const Elf64_Addr SymCryptVolatileFipsHmacKeyRva = (Elf64_Addr) PLACEHOLDER_VALUE;
|
||||
const Elf_Addr SymCryptVolatileFipsHmacKeyRva = (Elf_Addr) PLACEHOLDER_VALUE;
|
||||
|
||||
// Offset to the end of the FIPS module. Bytes after this offset are not considered part of our
|
||||
// FIPS module and are not included in the HMAC digest.
|
||||
const Elf64_Off SymCryptVolatileFipsBoundaryOffset = PLACEHOLDER_VALUE;
|
||||
const Elf_Off SymCryptVolatileFipsBoundaryOffset = PLACEHOLDER_VALUE;
|
||||
|
||||
// Key used for HMAC.
|
||||
const unsigned char SymCryptVolatileFipsHmacKey[32] = PLACEHOLDER_ARRAY;
|
||||
|
@ -44,32 +188,43 @@ unsigned char SymCryptVolatileFipsHmacDigest[SYMCRYPT_HMAC_SHA256_RESULT_SIZE] =
|
|||
|
||||
typedef struct
|
||||
{
|
||||
Elf64_Rela* rela;
|
||||
Elf_Rela* rela;
|
||||
Elf_Rel* rel;
|
||||
size_t relaEntryCount;
|
||||
Elf64_Rela* pltRela;
|
||||
size_t relEntryCount;
|
||||
union {
|
||||
Elf_Rela* rela;
|
||||
Elf_Rel* rel;
|
||||
Elf_Addr addr;
|
||||
} plt;
|
||||
size_t pltRelaEntryCount;
|
||||
} Elf64_Rela_Info;
|
||||
Elf_Addr pltRelAddendType;
|
||||
} Elf_Rela_Info;
|
||||
|
||||
|
||||
VOID SymCryptModuleUndoRelocation(
|
||||
_In_ const Elf64_Addr module_base,
|
||||
_Inout_ Elf64_Xword* const target,
|
||||
_In_ const Elf64_Rela* rela )
|
||||
_In_ const Elf_Addr module_base,
|
||||
_Inout_ Elf_Addr* const target,
|
||||
_In_ const Elf_Word relType )
|
||||
{
|
||||
Elf64_Xword replacement = 0;
|
||||
Elf_Addr replacement = 0;
|
||||
|
||||
switch( ELF64_R_TYPE( rela->r_info ) )
|
||||
switch( relType )
|
||||
{
|
||||
case R_X86_64_RELATIVE:
|
||||
case R_AARCH64_RELATIVE:
|
||||
replacement = *target - (Elf64_Off) module_base;
|
||||
case R_ARM_RELATIVE:
|
||||
replacement = *target - (Elf_Off) module_base;
|
||||
break;
|
||||
case R_X86_64_64:
|
||||
case R_X86_64_GLOB_DAT:
|
||||
case R_X86_64_JUMP_SLOT:
|
||||
case R_AARCH64_GLOB_DAT:
|
||||
case R_AARCH64_JUMP_SLOT:
|
||||
// R_X86_64_64, R_X86_64_GLOB_DAT and R_AARCH64_GLOB_DAT relocations all have initial
|
||||
// values of zero. R_X86_64_JUMP_SLOT and R_AARCH64_JUMP_SLOT relocations have initial
|
||||
case R_ARM_GLOB_DAT:
|
||||
case R_ARM_JUMP_SLOT:
|
||||
// R_X86_64_64 and R_*_GLOB_DAT relocations all have initial
|
||||
// values of zero. R_*_JUMP_SLOT relocations have initial
|
||||
// values that point into the PLT, but we set these to zero in our post-processing
|
||||
// script before HMACing the module. These relocation targets are excluded from our
|
||||
// FIPS module boundary because they're used for external function calls, which we
|
||||
|
@ -87,36 +242,51 @@ VOID SymCryptModuleUndoRelocation(
|
|||
}
|
||||
|
||||
VOID SymCryptModuleFindRelocationInfo(
|
||||
_In_ const Elf64_Dyn* const dynStart,
|
||||
_Out_ Elf64_Rela_Info* relaInfo)
|
||||
_In_ const Elf_Dyn* const dynStart,
|
||||
_Out_ Elf_Rela_Info* relaInfo)
|
||||
{
|
||||
relaInfo->rela = NULL;
|
||||
relaInfo->relaEntryCount = 0;
|
||||
relaInfo->pltRela = NULL;
|
||||
relaInfo->relEntryCount = 0;
|
||||
relaInfo->plt.addr = 0;
|
||||
relaInfo->pltRelaEntryCount = 0;
|
||||
|
||||
size_t relaTotalSize = 0;
|
||||
size_t relTotalSize = 0;
|
||||
size_t relaEntrySize = 0;
|
||||
size_t relEntrySize = 0;
|
||||
size_t pltTotalSize = 0;
|
||||
|
||||
for( const Elf64_Dyn* dyn = dynStart; dyn->d_tag != DT_NULL; ++dyn )
|
||||
for( const Elf_Dyn* dyn = dynStart; dyn->d_tag != DT_NULL; ++dyn )
|
||||
{
|
||||
switch( dyn->d_tag )
|
||||
{
|
||||
case DT_RELA:
|
||||
relaInfo->rela = ( Elf64_Rela* ) dyn->d_un.d_ptr;
|
||||
relaInfo->rela = ( Elf_Rela* ) dyn->d_un.d_ptr;
|
||||
break;
|
||||
|
||||
case DT_REL:
|
||||
relaInfo->rel = ( Elf_Rel* ) dyn->d_un.d_ptr;
|
||||
break;
|
||||
|
||||
case DT_RELASZ:
|
||||
relaTotalSize = dyn->d_un.d_val;
|
||||
break;
|
||||
|
||||
case DT_RELSZ:
|
||||
relTotalSize = dyn->d_un.d_val;
|
||||
break;
|
||||
|
||||
case DT_RELAENT:
|
||||
relaEntrySize = dyn->d_un.d_val;
|
||||
break;
|
||||
|
||||
case DT_RELENT:
|
||||
relEntrySize = dyn->d_un.d_val;
|
||||
break;
|
||||
|
||||
case DT_JMPREL:
|
||||
relaInfo->pltRela = ( Elf64_Rela* ) dyn->d_un.d_ptr;
|
||||
relaInfo->plt.addr = ( Elf_Addr ) dyn->d_un.d_ptr;
|
||||
break;
|
||||
|
||||
case DT_PLTRELSZ:
|
||||
|
@ -124,9 +294,8 @@ VOID SymCryptModuleFindRelocationInfo(
|
|||
break;
|
||||
|
||||
case DT_PLTREL:
|
||||
// Make sure PLT entries are DT_RELA entries and not DT_REL; we do not suppport
|
||||
// the latter
|
||||
SYMCRYPT_FIPS_ASSERT( dyn->d_un.d_val == DT_RELA );
|
||||
SYMCRYPT_FIPS_ASSERT( dyn->d_un.d_val == DT_RELA || dyn->d_un.d_val == DT_REL );
|
||||
relaInfo->pltRelAddendType = dyn->d_un.d_val;
|
||||
break;
|
||||
|
||||
default:
|
||||
|
@ -134,26 +303,45 @@ VOID SymCryptModuleFindRelocationInfo(
|
|||
}
|
||||
}
|
||||
|
||||
SYMCRYPT_FIPS_ASSERT( relaInfo->rela != NULL );
|
||||
SYMCRYPT_FIPS_ASSERT( relaEntrySize == sizeof( Elf64_Rela ) );
|
||||
SYMCRYPT_FIPS_ASSERT( relaTotalSize != 0 && relaTotalSize % relaEntrySize == 0 );
|
||||
// Need to have at least one type of relocations.
|
||||
SYMCRYPT_FIPS_ASSERT( relaInfo->rela != NULL || relaInfo->rel != NULL );
|
||||
|
||||
relaInfo->relaEntryCount = relaTotalSize / relaEntrySize;
|
||||
|
||||
// On AMD64 there should not be a PLT section, because we can't currently handle AMD64 PLT
|
||||
// relocations
|
||||
if( relaInfo->pltRela != NULL)
|
||||
if ( relaInfo->rela != NULL)
|
||||
{
|
||||
SYMCRYPT_FIPS_ASSERT( pltTotalSize != 0 && pltTotalSize % sizeof( Elf64_Rela ) == 0 );
|
||||
relaInfo->pltRelaEntryCount = pltTotalSize / sizeof( Elf64_Rela );
|
||||
SYMCRYPT_FIPS_ASSERT( relaEntrySize == sizeof( Elf_Rela ) );
|
||||
SYMCRYPT_FIPS_ASSERT( relaTotalSize != 0 && relaTotalSize % relaEntrySize == 0 );
|
||||
relaInfo->relaEntryCount = relaTotalSize / relaEntrySize;
|
||||
}
|
||||
if ( relaInfo->rel != NULL )
|
||||
{
|
||||
SYMCRYPT_FIPS_ASSERT( relaInfo->rel != NULL );
|
||||
SYMCRYPT_FIPS_ASSERT( relEntrySize == sizeof( Elf_Rel ) );
|
||||
SYMCRYPT_FIPS_ASSERT( relTotalSize != 0 && relTotalSize % relEntrySize == 0 );
|
||||
relaInfo->relEntryCount = relTotalSize / relEntrySize;
|
||||
}
|
||||
|
||||
if( relaInfo->plt.addr != 0)
|
||||
{
|
||||
SYMCRYPT_FIPS_ASSERT( pltTotalSize != 0 );
|
||||
// PLT relocations are either all rel or all rela.
|
||||
if ( relaInfo->pltRelAddendType == DT_RELA )
|
||||
{
|
||||
SYMCRYPT_FIPS_ASSERT( pltTotalSize % sizeof( Elf_Rela ) == 0 );
|
||||
relaInfo->pltRelaEntryCount = pltTotalSize / sizeof( Elf_Rela );
|
||||
}
|
||||
else
|
||||
{
|
||||
SYMCRYPT_FIPS_ASSERT( pltTotalSize % sizeof( Elf_Rel ) == 0 );
|
||||
relaInfo->pltRelaEntryCount = pltTotalSize / sizeof( Elf_Rel );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
size_t SymCryptModuleProcessSegmentWithRelocations(
|
||||
_In_ const Elf64_Addr module_base,
|
||||
_In_ const Elf64_Phdr* const programHeader,
|
||||
_In_ const Elf64_Dyn* const dynStart,
|
||||
_In_ const Elf64_Rela_Info* const relaInfo,
|
||||
_In_ const Elf_Addr module_base,
|
||||
_In_ const Elf_Phdr* const programHeader,
|
||||
_In_ const Elf_Dyn* const dynStart,
|
||||
_In_ const Elf_Rela_Info* const relaInfo,
|
||||
_Inout_ SYMCRYPT_HMAC_SHA256_STATE* hmacState )
|
||||
{
|
||||
// The segment that contains relocations consists of the following sections, in this order:
|
||||
|
@ -169,10 +357,19 @@ size_t SymCryptModuleProcessSegmentWithRelocations(
|
|||
// the module on disk is usually a different size than at runtime, so we cannot include it in
|
||||
// our HMAC either.
|
||||
//
|
||||
// In arm (32 bit) relocations also exist in .text section so we'll do SymCryptModuleProcessSegmentWithRelocations
|
||||
// on every section.
|
||||
//
|
||||
// FipsBoundaryOffset marks the start of the .data section, so we read from the start of the
|
||||
// segment up to that offset.
|
||||
size_t hashableSectionSize = SYMCRYPT_FORCE_READ64( &SymCryptVolatileFipsBoundaryOffset ) - programHeader->p_offset;
|
||||
Elf64_Addr segmentStart = module_base + programHeader->p_vaddr;
|
||||
size_t hashableSectionSize = programHeader->p_filesz;
|
||||
Elf_Addr segmentStart = module_base + programHeader->p_vaddr;
|
||||
|
||||
// If the data section is in this segment then exclude that from being hashed.
|
||||
if( SYMCRYPT_FORCE_READ_ADDR( &SymCryptVolatileFipsBoundaryOffset ) <= programHeader->p_offset + programHeader->p_filesz )
|
||||
{
|
||||
hashableSectionSize = SYMCRYPT_FORCE_READ_ADDR( &SymCryptVolatileFipsBoundaryOffset ) - programHeader->p_offset;
|
||||
}
|
||||
|
||||
BYTE* segmentCopy = SymCryptCallbackAlloc( hashableSectionSize );
|
||||
SYMCRYPT_FIPS_ASSERT( segmentCopy != NULL );
|
||||
|
@ -184,63 +381,103 @@ size_t SymCryptModuleProcessSegmentWithRelocations(
|
|||
// these relocations separately. We find the .dynamic section in the copied buffer based on
|
||||
// its offset from the start of the section, which is calculated by subtracting the address
|
||||
// of the start of the segment from the address of the .dynamic section in the segment.
|
||||
Elf64_Off dynOffsetInBuffer = (Elf64_Addr) dynStart - (Elf64_Addr) segmentStart;
|
||||
Elf64_Dyn* dynStartInBuffer = (Elf64_Dyn*) (segmentCopy + dynOffsetInBuffer);
|
||||
Elf_Off dynOffsetInBuffer = (Elf_Addr) dynStart - (Elf_Addr) segmentStart;
|
||||
Elf_Dyn* dynStartInBuffer = (Elf_Dyn*) (segmentCopy + dynOffsetInBuffer);
|
||||
|
||||
for( Elf64_Dyn* dyn = dynStartInBuffer; dyn->d_tag != DT_NULL; ++dyn )
|
||||
// If this segment contains the dynamic section then we need to process the relocations in it.
|
||||
if ((Elf_Addr)dynStart > segmentStart && (Elf_Addr)dynStart < segmentStart + hashableSectionSize)
|
||||
{
|
||||
// The following types of .dynamic entries have the module's base address added to
|
||||
// their initial value
|
||||
if( dyn->d_tag == DT_HASH ||
|
||||
dyn->d_tag == DT_STRTAB ||
|
||||
dyn->d_tag == DT_SYMTAB ||
|
||||
dyn->d_tag == DT_RELA ||
|
||||
dyn->d_tag == DT_GNU_HASH ||
|
||||
dyn->d_tag == DT_VERSYM ||
|
||||
dyn->d_tag == DT_PLTGOT ||
|
||||
dyn->d_tag == DT_JMPREL)
|
||||
for( Elf_Dyn* dyn = dynStartInBuffer; dyn->d_tag != DT_NULL; ++dyn )
|
||||
{
|
||||
dyn->d_un.d_val -= (Elf64_Xword) module_base;
|
||||
// The following types of .dynamic entries have the module's base address added to
|
||||
// their initial value
|
||||
if( dyn->d_tag == DT_HASH ||
|
||||
dyn->d_tag == DT_STRTAB ||
|
||||
dyn->d_tag == DT_SYMTAB ||
|
||||
dyn->d_tag == DT_RELA ||
|
||||
dyn->d_tag == DT_GNU_HASH ||
|
||||
dyn->d_tag == DT_VERSYM ||
|
||||
dyn->d_tag == DT_PLTGOT ||
|
||||
dyn->d_tag == DT_JMPREL ||
|
||||
dyn->d_tag == DT_REL)
|
||||
{
|
||||
dyn->d_un.d_val -= module_base;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Now we can process the normal relocations listed in the relocation table
|
||||
for( size_t i = 0; i < relaInfo->relaEntryCount; ++i )
|
||||
{
|
||||
const Elf64_Rela* rela = relaInfo->rela + i;
|
||||
const Elf_Rela* rela = relaInfo->rela + i;
|
||||
|
||||
// Find the relocation within the section. Note that for a shared object module,
|
||||
// rela->r_offset is actually a virtual address. Relocations can occur within the .data
|
||||
// section, which is outside our FIPS boundary, so any such relocations can be ignored.
|
||||
Elf64_Off offsetInBuffer = (Elf64_Off) rela->r_offset - (Elf64_Off) programHeader->p_vaddr;
|
||||
Elf_Off offsetInBuffer = (Elf_Off) rela->r_offset - (Elf_Off) programHeader->p_vaddr;
|
||||
if( offsetInBuffer > hashableSectionSize )
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
Elf64_Xword* target = (Elf64_Xword*) ( segmentCopy + offsetInBuffer);
|
||||
Elf_Addr* target = (Elf_Addr*) ( segmentCopy + offsetInBuffer);
|
||||
|
||||
SymCryptModuleUndoRelocation( module_base, target, rela );
|
||||
SymCryptModuleUndoRelocation( module_base, target, ELF_R_TYPE( rela->r_info ) );
|
||||
}
|
||||
|
||||
// Process the GOT entries from the .rela.plt section. Same as process above, just
|
||||
for( size_t i = 0; i < relaInfo->relEntryCount; ++i )
|
||||
{
|
||||
const Elf_Rel* rel = relaInfo->rel + i;
|
||||
|
||||
// Find the relocation within the section. Note that for a shared object module,
|
||||
// rela->r_offset is actually a virtual address. Relocations can occur within the .data
|
||||
// section, which is outside our FIPS boundary, so any such relocations can be ignored.
|
||||
Elf_Off offsetInBuffer = (Elf_Off) rel->r_offset - (Elf_Off) programHeader->p_vaddr;
|
||||
if( offsetInBuffer > hashableSectionSize )
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
Elf_Addr* target = (Elf_Addr*) ( segmentCopy + offsetInBuffer);
|
||||
|
||||
SymCryptModuleUndoRelocation( module_base, target, ELF_R_TYPE( rel->r_info ) );
|
||||
}
|
||||
|
||||
// Process the GOT entries from the .rela.plt or .rel.plt section. Same as process above, just
|
||||
// with a different table.
|
||||
for( size_t i = 0; i < relaInfo->pltRelaEntryCount; ++i)
|
||||
{
|
||||
const Elf64_Rela* rela = relaInfo->pltRela + i;
|
||||
Elf_Word type = 0;
|
||||
Elf_Off offsetInBuffer = 0;
|
||||
|
||||
if (relaInfo->pltRelAddendType == DT_RELA)
|
||||
{
|
||||
const Elf_Rela* rela = relaInfo->plt.rela + i;
|
||||
type = ELF_R_TYPE( rela->r_info );
|
||||
offsetInBuffer = (Elf_Off) rela->r_offset - (Elf_Off) programHeader->p_vaddr;
|
||||
}
|
||||
else
|
||||
{
|
||||
const Elf_Rel* rel = relaInfo->plt.rel + i;
|
||||
type = ELF_R_TYPE( rel->r_info );
|
||||
offsetInBuffer = (Elf_Off) rel->r_offset - (Elf_Off) programHeader->p_vaddr;
|
||||
}
|
||||
|
||||
Elf64_Off offsetInBuffer = (Elf64_Off) rela->r_offset - (Elf64_Off) programHeader->p_vaddr;
|
||||
if( offsetInBuffer > hashableSectionSize )
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
Elf64_Xword* target = (Elf64_Xword*) ( segmentCopy + offsetInBuffer);
|
||||
Elf_Addr* target = (Elf_Addr*) ( segmentCopy + offsetInBuffer);
|
||||
|
||||
SymCryptModuleUndoRelocation( module_base, target, rela );
|
||||
SymCryptModuleUndoRelocation( module_base, target, type );
|
||||
}
|
||||
|
||||
#if SYMCRYPT_DEBUG_INTEGRITY
|
||||
SymCryptHmacSha256AppendDbg( "Append after relocation adjust", hmacState, segmentCopy, hashableSectionSize );
|
||||
#else
|
||||
SymCryptHmacSha256Append( hmacState, segmentCopy, hashableSectionSize );
|
||||
#endif
|
||||
|
||||
SymCryptCallbackFree( segmentCopy );
|
||||
|
||||
|
@ -248,9 +485,9 @@ size_t SymCryptModuleProcessSegmentWithRelocations(
|
|||
}
|
||||
|
||||
VOID SymCryptModuleDoHmac(
|
||||
_In_ const Elf64_Addr module_base,
|
||||
_In_ const Elf64_Dyn* const dynStart,
|
||||
_In_ const Elf64_Rela_Info* const relaInfo )
|
||||
_In_ const Elf_Addr module_base,
|
||||
_In_ const Elf_Dyn* const dynStart,
|
||||
_In_ const Elf_Rela_Info* const relaInfo )
|
||||
{
|
||||
SYMCRYPT_ERROR scError = SYMCRYPT_NO_ERROR;
|
||||
SYMCRYPT_HMAC_SHA256_EXPANDED_KEY hmacKey;
|
||||
|
@ -263,37 +500,20 @@ VOID SymCryptModuleDoHmac(
|
|||
|
||||
SymCryptHmacSha256Init( &hmacState, &hmacKey );
|
||||
|
||||
const Elf64_Ehdr* header = (Elf64_Ehdr*) module_base;
|
||||
const Elf64_Phdr* programHeaderStart = (Elf64_Phdr*) ( module_base + header->e_phoff );
|
||||
const Elf_Ehdr* header = (Elf_Ehdr*) module_base;
|
||||
const Elf_Phdr* programHeaderStart = (Elf_Phdr*) ( module_base + header->e_phoff );
|
||||
|
||||
for( const Elf64_Phdr* programHeader = programHeaderStart;
|
||||
for( const Elf_Phdr* programHeader = programHeaderStart;
|
||||
programHeader->p_type == PT_LOAD; ++programHeader )
|
||||
{
|
||||
// Sometimes the virtual address of a segment is greater than its offset into the module
|
||||
// file on disk. This means extra NULL bytes will be inserted into the module's memory
|
||||
// space at runtime. Those bytes are not part of our FIPS boundary, so we skip over them
|
||||
// and always start reading from the segment's virtual address
|
||||
Elf64_Addr segmentStart = module_base + (Elf64_Off) programHeader->p_vaddr;
|
||||
Elf_Addr segmentStart = module_base + (Elf_Off) programHeader->p_vaddr;
|
||||
|
||||
if( (programHeader->p_flags & PF_W) == PF_W &&
|
||||
SYMCRYPT_FORCE_READ64( &SymCryptVolatileFipsBoundaryOffset ) <= programHeader->p_offset + programHeader->p_filesz )
|
||||
{
|
||||
// If we are processing the final writable segment (containing the .data section which
|
||||
// marks the end of our FIPS boundary), then we need to reverse relocations in it
|
||||
SymCryptModuleProcessSegmentWithRelocations( module_base, programHeader, dynStart,
|
||||
SymCryptModuleProcessSegmentWithRelocations( module_base, programHeader, dynStart,
|
||||
relaInfo, &hmacState );
|
||||
}
|
||||
else
|
||||
{
|
||||
// For AMD64/ARM64, non-writeable segments do not contain relocations, so we can write
|
||||
// them in their entirety without modification. Note that the size in memory of the
|
||||
// section may be larger than the size on disk, but again, the additional size in memory
|
||||
// is not part of our FIPS boundary
|
||||
// For now we assume that if there are writable segments before the final writable
|
||||
// segment that they also contain no relocations
|
||||
SymCryptHmacSha256Append( &hmacState, (PCBYTE) segmentStart,
|
||||
programHeader->p_filesz );
|
||||
}
|
||||
}
|
||||
|
||||
SymCryptHmacSha256Result( &hmacState, actualDigest );
|
||||
|
@ -307,32 +527,32 @@ VOID SymCryptModuleVerifyIntegrity(void)
|
|||
{
|
||||
// Verify that our placeholder values were modified after compile time. The build script
|
||||
// should have replaced the placeholder values with their expected values
|
||||
SYMCRYPT_FIPS_ASSERT( SYMCRYPT_FORCE_READ64( &SymCryptVolatileFipsHmacKeyRva ) != PLACEHOLDER_VALUE );
|
||||
SYMCRYPT_FIPS_ASSERT( SYMCRYPT_FORCE_READ64( &SymCryptVolatileFipsBoundaryOffset ) != PLACEHOLDER_VALUE );
|
||||
SYMCRYPT_FIPS_ASSERT( SYMCRYPT_FORCE_READ_ADDR( &SymCryptVolatileFipsHmacKeyRva ) != PLACEHOLDER_VALUE );
|
||||
SYMCRYPT_FIPS_ASSERT( SYMCRYPT_FORCE_READ_ADDR( &SymCryptVolatileFipsBoundaryOffset ) != PLACEHOLDER_VALUE );
|
||||
|
||||
const Elf64_Addr module_base = (Elf64_Addr) SymCryptVolatileFipsHmacKey -
|
||||
SYMCRYPT_FORCE_READ64( &SymCryptVolatileFipsHmacKeyRva );
|
||||
const Elf_Addr module_base = (Elf_Addr) SymCryptVolatileFipsHmacKey -
|
||||
SYMCRYPT_FORCE_READ_ADDR( &SymCryptVolatileFipsHmacKeyRva );
|
||||
|
||||
const Elf64_Ehdr* header = (Elf64_Ehdr*) module_base;
|
||||
const Elf_Ehdr* header = (Elf_Ehdr*) module_base;
|
||||
SYMCRYPT_FIPS_ASSERT( memcmp(header->e_ident.ident.magic, ElfMagic, sizeof(ElfMagic)) == 0 );
|
||||
SYMCRYPT_FIPS_ASSERT( header->e_type == ET_DYN );
|
||||
SYMCRYPT_FIPS_ASSERT( header->e_machine == EM_X86_64 || header->e_machine == EM_AARCH64 );
|
||||
SYMCRYPT_FIPS_ASSERT( header->e_machine == EM_X86_64 || header->e_machine == EM_AARCH64 || header->e_machine == EM_ARM );
|
||||
SYMCRYPT_FIPS_ASSERT( header->e_version == EV_CURRENT );
|
||||
SYMCRYPT_FIPS_ASSERT( header->e_ehsize == sizeof(Elf64_Ehdr) );
|
||||
SYMCRYPT_FIPS_ASSERT( header->e_phentsize == sizeof(Elf64_Phdr) );
|
||||
SYMCRYPT_FIPS_ASSERT( header->e_ehsize == sizeof(Elf_Ehdr) );
|
||||
SYMCRYPT_FIPS_ASSERT( header->e_phentsize == sizeof(Elf_Phdr) );
|
||||
|
||||
const Elf64_Phdr* programHeaderStart = (Elf64_Phdr*) ( module_base + header->e_phoff );
|
||||
const Elf_Phdr* programHeaderStart = (Elf_Phdr*) ( module_base + header->e_phoff );
|
||||
|
||||
Elf64_Rela_Info relaInfo = {};
|
||||
Elf_Rela_Info relaInfo = {};
|
||||
|
||||
Elf64_Dyn* dynStart = NULL;
|
||||
Elf_Dyn* dynStart = NULL;
|
||||
|
||||
for( unsigned int i = 0; i < header->e_phnum; ++i )
|
||||
{
|
||||
const Elf64_Phdr* programHeader = programHeaderStart + i;
|
||||
const Elf_Phdr* programHeader = programHeaderStart + i;
|
||||
if( programHeader->p_type == PT_DYNAMIC )
|
||||
{
|
||||
dynStart = (Elf64_Dyn*) (module_base + (Elf64_Off) programHeader->p_vaddr);
|
||||
dynStart = (Elf_Dyn*) (module_base + (Elf_Off) programHeader->p_vaddr);
|
||||
|
||||
SymCryptModuleFindRelocationInfo( dynStart, &relaInfo );
|
||||
|
||||
|
|
|
@ -1,21 +1,22 @@
|
|||
//
|
||||
// integrity.h
|
||||
// FIPS 140-3 integrity verification header for ELF binaries
|
||||
//
|
||||
// Copyright (c) Microsoft Corporation. Licensed under the MIT license.
|
||||
//
|
||||
|
||||
VOID SymCryptModuleVerifyIntegrity(void);
|
||||
//
|
||||
// This function verifies the integrity of the loadable segments of the SymCrypt ELF module using
|
||||
// HMAC-SHA256. The module must have been postprocessed after compilation using
|
||||
// process_fips_module.py. The integrity check finds the module's base address in memory by
|
||||
// subtracting the relative virtual address of a known variable from its actual address in memory.
|
||||
// It then uses the ELF header to find all the loadable segments in the module and calculate the
|
||||
// HMAC-SHA256 digest of these segments. For writeable segments which are subject to relocations,
|
||||
// the relocations will be reversed prior to being added to the HMAC, so that the HMAC input will
|
||||
// match the contents of the file on disk prior to relocation.
|
||||
//
|
||||
// If the integrity check fails for any reason, the module will fastfail, crashing the process,
|
||||
// since a failed integrity check means it cannot operate in compliance with FIPS 140-3.
|
||||
//
|
||||
//
|
||||
// integrity.h
|
||||
// FIPS 140-3 integrity verification header for ELF binaries
|
||||
//
|
||||
// Copyright (c) Microsoft Corporation. Licensed under the MIT license.
|
||||
//
|
||||
|
||||
VOID SymCryptModuleVerifyIntegrity(void);
|
||||
//
|
||||
// This function verifies the integrity of the loadable segments of the SymCrypt ELF module using
|
||||
// HMAC-SHA256. The module must have been postprocessed after compilation using
|
||||
// process_fips_module.py. The integrity check finds the module's base address in memory by
|
||||
// subtracting the relative virtual address of a known variable from its actual address in memory.
|
||||
// It then uses the ELF header to find all the loadable segments in the module and calculate the
|
||||
// HMAC-SHA256 digest of these segments. For writeable segments which are subject to relocations,
|
||||
// the relocations will be reversed prior to being added to the HMAC, so that the HMAC input will
|
||||
// match the contents of the file on disk prior to relocation.
|
||||
//
|
||||
// If the integrity check fails for any reason, the module will fastfail, crashing the process,
|
||||
// since a failed integrity check means it cannot operate in compliance with FIPS 140-3.
|
||||
//
|
||||
|
||||
|
|
|
@ -3,8 +3,8 @@ set(SOURCES
|
|||
../common/optional/rngforkdetection.c
|
||||
../common/optional/rngsecureurandom.c)
|
||||
|
||||
# Enable integrity verification if compiling for AMD64 or ARM64
|
||||
if(SYMCRYPT_TARGET_ARCH MATCHES "AMD64|ARM64")
|
||||
# Enable integrity verification if compiling for AMD64 or ARM64 or ARM
|
||||
if(SYMCRYPT_TARGET_ARCH MATCHES "AMD64|ARM")
|
||||
list(APPEND SOURCES ../common/integrity.c)
|
||||
else()
|
||||
list(APPEND SOURCES ../common/nointegrity.c)
|
||||
|
|
|
@ -15,7 +15,7 @@ import subprocess
|
|||
import sys
|
||||
from typing import List
|
||||
|
||||
ARCH_CMAKE = ("x86", "amd64", "arm64")
|
||||
ARCH_CMAKE = ("x86", "amd64", "arm64", "arm")
|
||||
CONFIG_CMAKE = ("Debug", "Release", "Sanitize")
|
||||
|
||||
ARCH_MSBUILD = ("x86", "amd64", "arm64")
|
||||
|
@ -35,8 +35,8 @@ def get_normalized_host_arch() -> str:
|
|||
normalized_arch = "amd64"
|
||||
elif re.fullmatch("ARM64|aarch64", host_arch):
|
||||
normalized_arch = "arm64"
|
||||
|
||||
# No support for ARM32 right now
|
||||
elif re.fullmatch("ARM32|aarch32", host_arch):
|
||||
normalized_arch = "arm"
|
||||
|
||||
if not normalized_arch:
|
||||
print("Unrecognized host architecture " + host_arch, file = sys.stderr)
|
||||
|
@ -82,7 +82,8 @@ def configure_cmake(args : argparse.Namespace) -> None:
|
|||
cmake_args.append("x64")
|
||||
elif args.arch == "arm64":
|
||||
cmake_args.append("arm64")
|
||||
|
||||
elif args.arch == "arm":
|
||||
cmake_args.append("arm")
|
||||
# No support for ARM32 right now
|
||||
|
||||
if args.host_arch != args.arch:
|
||||
|
|
|
@ -197,7 +197,7 @@ def main() -> None:
|
|||
|
||||
parser = argparse.ArgumentParser(description = "Packaging helper script for SymCrypt.")
|
||||
parser.add_argument("build_dir", type = pathlib.Path, help = "Build output directory.")
|
||||
parser.add_argument("arch", type = str.lower, help = "Architecture of the binaries to package (for inclusion in the package name).", choices = ("x86", "amd64", "arm64"))
|
||||
parser.add_argument("arch", type = str.lower, help = "Architecture of the binaries to package (for inclusion in the package name).", choices = ("x86", "amd64", "arm64", "arm"))
|
||||
parser.add_argument("config", type = str, help = "Build configuration.", choices = ["Debug", "Release", "Sanitize"])
|
||||
parser.add_argument("module_name", type = str, help = "Name of the module to package.")
|
||||
parser.add_argument("release_dir", type = pathlib.Path, help = "Directory to place the release in.")
|
||||
|
|
|
@ -21,7 +21,7 @@ import struct
|
|||
|
||||
from elftools.elf.constants import P_FLAGS
|
||||
from elftools.elf.elffile import ELFFile
|
||||
from elftools.elf.enums import ENUM_RELOC_TYPE_x64, ENUM_RELOC_TYPE_AARCH64
|
||||
from elftools.elf.enums import ENUM_RELOC_TYPE_x64, ENUM_RELOC_TYPE_AARCH64, ENUM_RELOC_TYPE_ARM
|
||||
|
||||
# Names of global constants in the FIPS module that need to be replaced
|
||||
KEY_NAME = "SymCryptVolatileFipsHmacKey"
|
||||
|
@ -33,9 +33,27 @@ DIGEST_NAME = "SymCryptVolatileFipsHmacDigest"
|
|||
CHAR_FORMAT_SPECIFIER = "s"
|
||||
QWORD_FORMAT_SPECIFIER = "Q"
|
||||
QWORD_BYTE_SIZE = struct.calcsize(QWORD_FORMAT_SPECIFIER)
|
||||
DWORD_FORMAT_SPECIFIER = "I"
|
||||
DWORD_BYTE_SIZE = struct.calcsize(DWORD_FORMAT_SPECIFIER)
|
||||
|
||||
RELOCATION_TYPE_SIZES = {
|
||||
ENUM_RELOC_TYPE_x64["R_X86_64_JUMP_SLOT"]: {
|
||||
'size': QWORD_BYTE_SIZE,
|
||||
'format': QWORD_FORMAT_SPECIFIER,
|
||||
},
|
||||
ENUM_RELOC_TYPE_AARCH64["R_AARCH64_JUMP_SLOT"]: {
|
||||
'size': QWORD_BYTE_SIZE,
|
||||
'format': QWORD_FORMAT_SPECIFIER,
|
||||
},
|
||||
ENUM_RELOC_TYPE_ARM["R_ARM_JUMP_SLOT"]: {
|
||||
'size': DWORD_BYTE_SIZE,
|
||||
'format': DWORD_FORMAT_SPECIFIER,
|
||||
},
|
||||
}
|
||||
|
||||
# Must match the placeholder values in integrity.c
|
||||
PLACEHOLDER_VALUE = struct.pack(QWORD_FORMAT_SPECIFIER, 0x8BADF00D)
|
||||
PLACEHOLDER_VALUE_64BIT = struct.pack(QWORD_FORMAT_SPECIFIER, 0x4BADF00D8BADF00D)
|
||||
PLACEHOLDER_VALUE_32BIT = struct.pack(DWORD_FORMAT_SPECIFIER, 0x8BADF00D)
|
||||
PLACEHOLDER_ARRAY = bytes((
|
||||
0x5B, 0x75, 0xBB, 0xE4, 0x9E, 0x18, 0x03, 0x55,
|
||||
0x08, 0x4E, 0x3F, 0xE7, 0x60, 0x7E, 0x4F, 0x08,
|
||||
|
@ -136,7 +154,8 @@ class ElfFileValueProxy(object):
|
|||
# The .data() method returns a bytes object. We can't use it to write back to the original
|
||||
# buffer, so we need to find the appropriate section within the stream using sh_offset and
|
||||
# write to that.
|
||||
logging.debug("Writing {} to offset {}".format(value.hex(), hex(self.offset)))
|
||||
# Note self.section.stream == self.elf_file.stream.
|
||||
logging.debug("Changing {} writing {} to offset {}".format(self.name, value.hex(), hex(self.offset)))
|
||||
self.section.stream.seek(self.offset)
|
||||
self.section.stream.write(value)
|
||||
|
||||
|
@ -145,7 +164,7 @@ class ElfFileValueProxy(object):
|
|||
assert(len(new_value) == self.length)
|
||||
|
||||
if self.name is not None:
|
||||
logging.debug("Changing {} value".format(self.name))
|
||||
logging.debug("Changing {} value to {}".format(self.name, *args))
|
||||
|
||||
self.value = new_value
|
||||
|
||||
|
@ -156,6 +175,36 @@ def log_value(var):
|
|||
hex(var.vaddr),
|
||||
var.value.hex()))
|
||||
|
||||
def dbg_dump_hex(data, address=0, file=None):
|
||||
digits = "0123456789abcdef"
|
||||
char_per_line = 16
|
||||
remaining = len(data)
|
||||
for line_pos in range(0, len(data), char_per_line):
|
||||
chars_line = char_per_line if remaining > char_per_line else remaining
|
||||
line = "{:08x} ".format(address)
|
||||
address += chars_line
|
||||
|
||||
for i in range(char_per_line):
|
||||
if i < chars_line:
|
||||
line += "{:02x}".format(data[line_pos + i])
|
||||
if i == 7:
|
||||
line += ":"
|
||||
else:
|
||||
line += " "
|
||||
else:
|
||||
line += " "
|
||||
|
||||
line += " "
|
||||
|
||||
for i in range(chars_line):
|
||||
if data[line_pos + i] < 32 or data[line_pos + i] > 126:
|
||||
line += "."
|
||||
else:
|
||||
line += chr(data[line_pos + i])
|
||||
print(line, file=file)
|
||||
remaining -= char_per_line
|
||||
|
||||
|
||||
def hmac_module(loadable_segments, data_section_offset, key, digest, dump_file_path = None):
|
||||
"""
|
||||
Performs HMAC-SHA256 on module contents and writes it back to the module buffer
|
||||
|
@ -164,6 +213,10 @@ def hmac_module(loadable_segments, data_section_offset, key, digest, dump_file_p
|
|||
module_bytes = bytearray()
|
||||
last_segment_offset = -1
|
||||
|
||||
dump_file = None
|
||||
if dump_file_path is not None:
|
||||
dump_file = open(dump_file_path + '.txt', "w")
|
||||
|
||||
for (index, segment) in enumerate(loadable_segments):
|
||||
|
||||
segment_hashable_length = 0
|
||||
|
@ -175,9 +228,13 @@ def hmac_module(loadable_segments, data_section_offset, key, digest, dump_file_p
|
|||
if segment["p_offset"] + segment["p_filesz"] > data_section_offset:
|
||||
segment_hashable_length = data_section_offset - segment["p_offset"]
|
||||
module_bytes += segment.data()[:segment_hashable_length]
|
||||
print("\nHMAC append: off {:08x} past data segment size {:x}".format(segment['p_offset'], segment_hashable_length), file=dump_file)
|
||||
else:
|
||||
module_bytes += segment.data()
|
||||
segment_hashable_length = len(segment.data())
|
||||
print("\nHMAC append: off {:08x} normal size {:x}".format(segment['p_offset'], segment_hashable_length), file=dump_file)
|
||||
|
||||
dbg_dump_hex(segment.data()[:segment_hashable_length], file=dump_file)
|
||||
|
||||
logging.info("Segment {}: {} - {}".format(
|
||||
index,
|
||||
|
@ -186,7 +243,8 @@ def hmac_module(loadable_segments, data_section_offset, key, digest, dump_file_p
|
|||
|
||||
last_segment_offset = segment["p_offset"]
|
||||
|
||||
if dump_file_path is not None:
|
||||
if dump_file is not None:
|
||||
dump_file.close()
|
||||
with open(dump_file_path, "wb") as dump_file:
|
||||
dump_file.write(module_bytes)
|
||||
|
||||
|
@ -265,29 +323,34 @@ def overwrite_jump_slots(elf_file, new_value):
|
|||
(vaddr, original value), so that they can be reset after the HMAC digest is calculated.
|
||||
"""
|
||||
|
||||
rela_plt_section = elf_file.get_section_by_name(".rela.plt")
|
||||
|
||||
# Jump slot relocations always live in the .rela.plt section. If there is no .rela.plt section,
|
||||
# there will be no jump slot relocations
|
||||
if rela_plt_section is None:
|
||||
return []
|
||||
|
||||
original_jump_slot_values = []
|
||||
for relocation in rela_plt_section.iter_relocations():
|
||||
|
||||
# Jump slot relocations live in .rela.plt or .rel.plt sections.
|
||||
rela_plt_section = elf_file.get_section_by_name(".rela.plt")
|
||||
rel_plt_section = elf_file.get_section_by_name(".rel.plt")
|
||||
relocations = []
|
||||
if rela_plt_section is not None:
|
||||
relocations += rela_plt_section.iter_relocations()
|
||||
if rel_plt_section is not None:
|
||||
relocations += rel_plt_section.iter_relocations()
|
||||
|
||||
for relocation in relocations:
|
||||
relocation_type = relocation["r_info_type"]
|
||||
if (relocation_type == ENUM_RELOC_TYPE_x64["R_X86_64_JUMP_SLOT"]
|
||||
or relocation_type == ENUM_RELOC_TYPE_AARCH64["R_AARCH64_JUMP_SLOT"]):
|
||||
|
||||
logging.debug("Found relocation type {}".format(relocation_type))
|
||||
if relocation_type in RELOCATION_TYPE_SIZES:
|
||||
# Note that r_offset is actually a virtual address
|
||||
relocation_value = ElfFileValueProxy.from_vaddr(elf_file, relocation["r_offset"], QWORD_BYTE_SIZE)
|
||||
relocation_value = ElfFileValueProxy.from_vaddr(elf_file, relocation["r_offset"],
|
||||
RELOCATION_TYPE_SIZES[relocation_type]["size"])
|
||||
|
||||
original_value_int = struct.unpack(QWORD_FORMAT_SPECIFIER, relocation_value.value)[0]
|
||||
original_jump_slot_values.append((relocation_value.vaddr, original_value_int))
|
||||
original_value_int = struct.unpack(RELOCATION_TYPE_SIZES[relocation_type]["format"], relocation_value.value)[0]
|
||||
original_jump_slot_values.append((relocation_value.vaddr, original_value_int, relocation_type))
|
||||
|
||||
logging.debug("Updating relocation at {} with original value {}".format(
|
||||
hex(relocation_value.offset), hex(original_value_int)))
|
||||
|
||||
relocation_value.set_value(QWORD_FORMAT_SPECIFIER, new_value)
|
||||
relocation_value.set_value(RELOCATION_TYPE_SIZES[relocation_type]["format"], new_value)
|
||||
else:
|
||||
logging.warning("Unknown relocation type {} found at offset {}".format(
|
||||
relocation_type, relocation["r_offset"]))
|
||||
|
@ -301,13 +364,14 @@ def reset_jump_slots(elf_file, original_jump_slot_values):
|
|||
lazy binding still works.
|
||||
"""
|
||||
|
||||
for vaddr, original_value in original_jump_slot_values:
|
||||
relocation_value = ElfFileValueProxy.from_vaddr(elf_file, vaddr, QWORD_BYTE_SIZE)
|
||||
for vaddr, original_value, relocation_type in original_jump_slot_values:
|
||||
relocation_value = ElfFileValueProxy.from_vaddr(elf_file, vaddr,
|
||||
RELOCATION_TYPE_SIZES[relocation_type]["size"])
|
||||
|
||||
logging.debug("Resetting relocation at {} to original value {}".format(
|
||||
hex(vaddr), hex(original_value)))
|
||||
|
||||
relocation_value.set_value(QWORD_FORMAT_SPECIFIER, original_value)
|
||||
relocation_value.set_value(RELOCATION_TYPE_SIZES[relocation_type]["format"], original_value)
|
||||
|
||||
def main():
|
||||
"""
|
||||
|
@ -335,7 +399,7 @@ def main():
|
|||
elf_file = ELFFile(buffer_stream)
|
||||
|
||||
arch = elf_file["e_machine"]
|
||||
if not arch == "EM_X86_64" and not arch == "EM_AARCH64":
|
||||
if not arch == "EM_X86_64" and not arch == "EM_AARCH64" and not arch == "EM_ARM":
|
||||
logging.error("Unsupported architecture {}".format(arch))
|
||||
raise RuntimeError
|
||||
|
||||
|
@ -354,20 +418,32 @@ def main():
|
|||
# Find the HMAC key relative virtual address placeholder and replace it with
|
||||
# the actual relative virtual address of the HMAC key
|
||||
key_rva_variable = ElfFileValueProxy.from_symbol_name(elf_file, KEY_RVA_NAME)
|
||||
assert(key_rva_variable.value == PLACEHOLDER_VALUE)
|
||||
if arch == "EM_ARM":
|
||||
assert(key_rva_variable.value == PLACEHOLDER_VALUE_32BIT)
|
||||
else:
|
||||
assert(key_rva_variable.value == PLACEHOLDER_VALUE_64BIT)
|
||||
|
||||
key_rva_variable.set_value(QWORD_FORMAT_SPECIFIER, key_variable.vaddr)
|
||||
if arch == "EM_ARM":
|
||||
key_rva_variable.set_value(DWORD_FORMAT_SPECIFIER, key_variable.vaddr)
|
||||
else:
|
||||
key_rva_variable.set_value(QWORD_FORMAT_SPECIFIER, key_variable.vaddr)
|
||||
log_value(key_rva_variable)
|
||||
|
||||
# Find the FIPS module boundary placeholder and replace it with the our actual
|
||||
# FIPS module boundary, which we have defined to be the start of the .data section
|
||||
fips_boundary_variable = ElfFileValueProxy.from_symbol_name(elf_file, BOUNDARY_OFFSET_NAME)
|
||||
assert(fips_boundary_variable.value == PLACEHOLDER_VALUE)
|
||||
if arch == "EM_ARM":
|
||||
assert(fips_boundary_variable.value == PLACEHOLDER_VALUE_32BIT)
|
||||
else:
|
||||
assert(fips_boundary_variable.value == PLACEHOLDER_VALUE_64BIT)
|
||||
|
||||
data_section = elf_file.get_section_by_name(".data")
|
||||
data_section_offset = data_section["sh_offset"]
|
||||
|
||||
fips_boundary_variable.set_value(QWORD_FORMAT_SPECIFIER, data_section_offset)
|
||||
if arch == "EM_ARM":
|
||||
fips_boundary_variable.set_value(DWORD_FORMAT_SPECIFIER, data_section_offset)
|
||||
else:
|
||||
fips_boundary_variable.set_value(QWORD_FORMAT_SPECIFIER, data_section_offset)
|
||||
log_value(fips_boundary_variable)
|
||||
|
||||
# Find the HMAC digest placeholder, HMAC the loadable segments of the module, and replace
|
||||
|
|
|
@ -20,9 +20,9 @@ appropriate to enable this effort.
|
|||
Normally the processing of symcryptasm files takes place in 2 passes. The first pass is performed by
|
||||
this symcryptasm_processor.py script, which does the more stateful processing, outputting a .cppasm
|
||||
file. If the processed symcryptasm file includes other files via the INCLUDE directive, the contents
|
||||
of the included files are merged at their point of inclusion to generate a single expanded symcryptasm
|
||||
file which is saved with a .symcryptasmexp extension to the output folder. For symcryptasm files which
|
||||
do not include other files, there's no corresponding .symcryptasmexp file as it would be identical to
|
||||
of the included files are merged at their point of inclusion to generate a single expanded symcryptasm
|
||||
file which is saved with a .symcryptasmexp extension to the output folder. For symcryptasm files which
|
||||
do not include other files, there's no corresponding .symcryptasmexp file as it would be identical to
|
||||
the source file.
|
||||
|
||||
The .cppasm files are further processed by the C preprocessor to do more simple stateless text
|
||||
|
@ -78,7 +78,7 @@ A leaf function ends with the FUNCTION_END macro, which also takes the function
|
|||
|
||||
At the function start a prologue is generated which arranges the arguments appropriately in
|
||||
registers, and saves non-volatile registers that have been requested to be used.
|
||||
At the function end an epilogue is generated with restores the non-volatile registers and returns.
|
||||
At the function end an epilogue is generated which restores the non-volatile registers and returns.
|
||||
|
||||
|
||||
A nested function (a function which does call another function) is specified similarly, only using
|
||||
|
@ -112,14 +112,14 @@ prologue. The MUL_FUNCTION_START and MUL_FUNCTION_END macros are used in this ca
|
|||
We currently do not support nested mul functions, as we have none of them.
|
||||
|
||||
Stack layout for amd64 is as follows. Xmm registers are volatile and not saved on Linux.
|
||||
|
||||
|
||||
Memory Exists if
|
||||
|-------------------|
|
||||
| |
|
||||
| Shadow space |
|
||||
| |
|
||||
| Shadow space |
|
||||
| |
|
||||
|-------------------|
|
||||
| Return address |
|
||||
| Return address |
|
||||
|-------------------|
|
||||
| Non-volatile |
|
||||
| general purpose | reg_count > volatile_registers
|
||||
|
@ -149,6 +149,11 @@ currently support, these registers are volatile so do not need any special handl
|
|||
X_0 is always the result register and the first argument passed to the function.
|
||||
X_1-X_7 are the arguments 2-8 passed to the function
|
||||
|
||||
|
||||
### arm (32) ###
|
||||
We allow the registers r0-r12 to be addressed as r13-r15 are special registers that we cannot use as general purpose registers.
|
||||
As r12 is volatile in a leaf function, it should be used in preference to r4, to avoid spilling/restoring a register.
|
||||
|
||||
"""
|
||||
|
||||
import re
|
||||
|
@ -216,6 +221,24 @@ ARM64_R28 = Register("x28", "w28")
|
|||
ARM64_R29 = Register("x29", "w29") # Frame Pointer
|
||||
ARM64_R30 = Register("x30", "w30") # Link Register
|
||||
|
||||
# arm32 registers
|
||||
ARM32_R0 = Register(None, "r0")
|
||||
ARM32_R1 = Register(None, "r1")
|
||||
ARM32_R2 = Register(None, "r2")
|
||||
ARM32_R3 = Register(None, "r3")
|
||||
ARM32_R4 = Register(None, "r4")
|
||||
ARM32_R5 = Register(None, "r5")
|
||||
ARM32_R6 = Register(None, "r6")
|
||||
ARM32_R7 = Register(None, "r7")
|
||||
ARM32_R8 = Register(None, "r8")
|
||||
ARM32_R9 = Register(None, "r9")
|
||||
ARM32_R10 = Register(None, "r10")
|
||||
ARM32_R11 = Register(None, "r11")
|
||||
ARM32_R12 = Register(None, "r12")
|
||||
ARM32_R13 = Register(None, "r13")
|
||||
ARM32_R14 = Register(None, "r14")
|
||||
ARM32_R15 = Register(None, "r15")
|
||||
|
||||
class CallingConvention:
|
||||
"""A class to represent calling conventions"""
|
||||
|
||||
|
@ -306,7 +329,7 @@ def calc_amd64_stack_allocation_sizes(self, reg_count, stack_alloc_size, xmm_reg
|
|||
aligned_on_16B = True
|
||||
|
||||
# Calculate the space needed to save Xmm registers on the stack
|
||||
saved_reg_xmm = 0 if xmm_reg_count <= 6 else (xmm_reg_count - 6)
|
||||
saved_reg_xmm = 0 if xmm_reg_count <= 6 else (xmm_reg_count - 6)
|
||||
xmm_save_size = 16 * saved_reg_xmm
|
||||
if xmm_save_size > 0 and not aligned_on_16B:
|
||||
xmm_save_size += 8
|
||||
|
@ -337,7 +360,7 @@ def gen_prologue_amd64_msft(self, arg_count, reg_count, stack_alloc_size, xmm_re
|
|||
|
||||
prologue = "\n"
|
||||
|
||||
# Calculate the sizes of the buffers needed for saving registers, local variable buffer and shadow space.
|
||||
# Calculate the sizes of the buffers needed for saving registers, local variable buffer and shadow space.
|
||||
# Each of the sections other than general purpose registers are aligned on 16B boundary and some of them
|
||||
# may include an 8B padding in their size.
|
||||
reg_save_size, xmm_save_size, stack_alloc_aligned_size, shadow_space_allocation_size = calc_amd64_stack_allocation_sizes(
|
||||
|
@ -380,12 +403,12 @@ def gen_prologue_amd64_msft_nested(self, arg_count, reg_count, stack_alloc_size,
|
|||
return gen_prologue_amd64_msft(self, arg_count, reg_count, stack_alloc_size, xmm_reg_count, mul_fixup = "", nested = True)
|
||||
|
||||
def gen_epilogue_amd64_msft(self, arg_count, reg_count, stack_alloc_size, xmm_reg_count, nested = False):
|
||||
|
||||
|
||||
epilogue = "\n"
|
||||
|
||||
reg_save_size, xmm_save_size, stack_alloc_aligned_size, shadow_space_allocation_size = calc_amd64_stack_allocation_sizes(
|
||||
self, reg_count, stack_alloc_size, xmm_reg_count, nested)
|
||||
|
||||
|
||||
# Restore non-volatile Xmm registers
|
||||
if xmm_save_size > 0:
|
||||
for i in range(6, xmm_reg_count):
|
||||
|
@ -454,7 +477,7 @@ MAPPING_AMD64_SYSTEMV = {
|
|||
}
|
||||
|
||||
def gen_prologue_amd64_systemv(self, arg_count, reg_count, stack_alloc_size, xmm_reg_count, mul_fixup = "", nested = False):
|
||||
|
||||
|
||||
# Calculate the sizes required for each section
|
||||
# We need to call with xmm_reg_count=0 to avoid allocation/alignment for saving Xmm registers since they're
|
||||
# volatile for this calling convention.
|
||||
|
@ -491,7 +514,7 @@ def gen_prologue_amd64_systemv_nested(self, arg_count, reg_count, stack_alloc_si
|
|||
return gen_prologue_amd64_systemv(self, arg_count, reg_count, stack_alloc_size, xmm_reg_count, mul_fixup = "", nested = True)
|
||||
|
||||
def gen_epilogue_amd64_systemv(self, arg_count, reg_count, stack_alloc_size, xmm_reg_count, nested = False):
|
||||
|
||||
|
||||
epilogue = ""
|
||||
|
||||
# Calculate the sizes required for each section
|
||||
|
@ -601,6 +624,27 @@ MAPPING_ARM64_ARM64ECMSFT = {
|
|||
# R28 is reserved in ARM64EC
|
||||
}
|
||||
|
||||
# ARM32 calling convention
|
||||
# A subroutine must preserve the contents of the registers r4-r8, r10, r11 and SP (and r9 in PCS variants that designate r9 as v6).
|
||||
MAPPING_ARM32_AAPCS32 = {
|
||||
0: ARM32_R0, # Argument 1 / Result register / volatile
|
||||
1: ARM32_R1, # Argument 2 / Result register / volatile
|
||||
2: ARM32_R2, # Argument 3 / volatile
|
||||
3: ARM32_R3, # Argument 4 / volatile
|
||||
4: ARM32_R4, # non-volatile
|
||||
5: ARM32_R5, # non-volatile
|
||||
6: ARM32_R6, # non-volatile
|
||||
7: ARM32_R7, # non-volatile
|
||||
8: ARM32_R8, # non-volatile
|
||||
9: ARM32_R9, # reserved for something
|
||||
10:ARM32_R10, # non-volatile
|
||||
11:ARM32_R11, # FP non-volatile
|
||||
12:ARM32_R12, # volatile for leaf functions
|
||||
13:ARM32_R13, # SP
|
||||
14:ARM32_R14, # LR
|
||||
15:ARM32_R15, # PC
|
||||
}
|
||||
|
||||
def gen_prologue_aapcs64(self, arg_count, reg_count, stack_alloc_size, xmm_reg_count):
|
||||
prologue = ""
|
||||
|
||||
|
@ -621,6 +665,38 @@ def gen_epilogue_aapcs64(self, arg_count, reg_count, stack_alloc_size, xmm_reg_c
|
|||
|
||||
return epilogue
|
||||
|
||||
def gen_prologue_aapcs32(self, arg_count, reg_count, stack_alloc_size, xmm_reg_count):
|
||||
assert(not stack_alloc_size and not xmm_reg_count)
|
||||
prologue = ""
|
||||
# Always spill at least 1 register (LR).
|
||||
# LR needs to be saved for nested functions but for now we'll store it always
|
||||
# since we don't differentiate between nested and leaf functions for arm yet.
|
||||
registers_to_spill = []
|
||||
logging.info(f"prologue {reg_count} > {self.volatile_registers}")
|
||||
if reg_count > self.volatile_registers:
|
||||
for i in range(self.volatile_registers, reg_count):
|
||||
registers_to_spill.append('r%s' % i)
|
||||
# Stack pointer is word 4B aligned
|
||||
# required_stack_space = 4 * len(registers_to_spill)
|
||||
registers_to_spill.append('lr')
|
||||
prologue += "push {" + ",".join(registers_to_spill) + "}\n"
|
||||
return prologue
|
||||
|
||||
def gen_epilogue_aapcs32(self, arg_count, reg_count, stack_alloc_size, xmm_reg_count):
|
||||
assert(not stack_alloc_size and not xmm_reg_count)
|
||||
epilogue = ""
|
||||
|
||||
registers_to_spill = []
|
||||
logging.info(f"epilogue {reg_count} > {self.volatile_registers}")
|
||||
if reg_count > self.volatile_registers:
|
||||
for i in range(self.volatile_registers, reg_count):
|
||||
registers_to_spill.append('r%s' % i)
|
||||
# Stack pointer is word 4B aligned
|
||||
# required_stack_space = 4 * len(registers_to_spill)
|
||||
registers_to_spill.append('pc')
|
||||
epilogue += "pop {" + ",".join(registers_to_spill) + "}\n"
|
||||
return epilogue
|
||||
|
||||
def gen_prologue_arm64ec(self, arg_count, reg_count, stack_alloc_size, xmm_reg_count):
|
||||
prologue = ""
|
||||
|
||||
|
@ -677,6 +753,10 @@ CALLING_CONVENTION_ARM64EC_MSFT = CallingConvention(
|
|||
"arm64ec_msft", "arm64", MAPPING_ARM64_ARM64ECMSFT, 8, 8, 16,
|
||||
gen_prologue_arm64ec, gen_epilogue_arm64ec, gen_get_memslot_offset_arm64)
|
||||
|
||||
CALLING_CONVENTION_ARM32_AAPCS32 = CallingConvention(
|
||||
"arm32_aapcs32", "arm32", MAPPING_ARM32_AAPCS32, 4, 4, 4,
|
||||
gen_prologue_aapcs32, gen_epilogue_aapcs32, gen_get_memslot_offset_arm64)
|
||||
|
||||
def gen_function_defines(architecture, mapping, arg_count, reg_count, start=True):
|
||||
defines = ""
|
||||
if architecture == "amd64":
|
||||
|
@ -687,6 +767,8 @@ def gen_function_defines(architecture, mapping, arg_count, reg_count, start=True
|
|||
elif architecture == "arm64":
|
||||
prefix64 = "X_"
|
||||
prefix32 = "W_"
|
||||
elif architecture == "arm32":
|
||||
return defines
|
||||
else:
|
||||
logging.error("Unhandled architecture (%s) in gen_function_defines" % architecture)
|
||||
exit(1)
|
||||
|
@ -736,8 +818,8 @@ MASM_FUNCTION_TEMPLATE = "%s, _TEXT\n"
|
|||
# ARMASM64 function macros must be correctly indented
|
||||
ARMASM64_FUNCTION_TEMPLATE = " %s\n"
|
||||
|
||||
GAS_FUNCTION_ENTRY = "%s: .global %s\n"
|
||||
GAS_FUNCTION_END = ""
|
||||
GAS_FUNCTION_ENTRY = "%s: .global %s\n.type %s, %%function\n// .func %s\n"
|
||||
GAS_FUNCTION_END = "// .endfunc // %s"
|
||||
|
||||
def generate_prologue(assembler, calling_convention, function_name, arg_count, reg_count, stack_alloc_size, xmm_reg_count, nested):
|
||||
function_entry = None
|
||||
|
@ -756,7 +838,7 @@ def generate_prologue(assembler, calling_convention, function_name, arg_count, r
|
|||
elif assembler == "armasm64":
|
||||
function_entry = ARMASM64_FUNCTION_TEMPLATE % function_entry
|
||||
elif assembler == "gas":
|
||||
function_entry = GAS_FUNCTION_ENTRY % (function_name, function_name)
|
||||
function_entry = GAS_FUNCTION_ENTRY % (function_name, function_name, function_name, function_name)
|
||||
else:
|
||||
logging.error("Unhandled assembler (%s) in generate_prologue" % assembler)
|
||||
exit(1)
|
||||
|
@ -784,7 +866,7 @@ def generate_epilogue(assembler, calling_convention, function_name, arg_count, r
|
|||
elif assembler == "armasm64":
|
||||
function_end = ARMASM64_FUNCTION_TEMPLATE % function_end
|
||||
elif assembler == "gas":
|
||||
function_end = GAS_FUNCTION_END
|
||||
function_end = GAS_FUNCTION_END % function_name
|
||||
else:
|
||||
logging.error("Unhandled assembler (%s) in generate_epilogue" % assembler)
|
||||
exit(1)
|
||||
|
@ -928,11 +1010,11 @@ class ProcessingStateMachine:
|
|||
else:
|
||||
self.calling_convention = self.normal_calling_convention
|
||||
|
||||
return generate_prologue(self.assembler,
|
||||
self.calling_convention,
|
||||
self.function_name,
|
||||
self.arg_count,
|
||||
self.reg_count,
|
||||
return generate_prologue(self.assembler,
|
||||
self.calling_convention,
|
||||
self.function_name,
|
||||
self.arg_count,
|
||||
self.reg_count,
|
||||
self.stack_alloc_size,
|
||||
self.xmm_reg_count,
|
||||
self.is_nested_function
|
||||
|
@ -1115,6 +1197,10 @@ def process_file(assembler, architecture, calling_convention, infilename, outfil
|
|||
normal_calling_convention = CALLING_CONVENTION_ARM64_AAPCS64
|
||||
mul_calling_convention = None
|
||||
nested_calling_convention = None
|
||||
elif architecture == "arm" and calling_convention == "aapcs32":
|
||||
normal_calling_convention = CALLING_CONVENTION_ARM32_AAPCS32
|
||||
mul_calling_convention = None
|
||||
nested_calling_convention = None
|
||||
elif assembler == "armasm64":
|
||||
if architecture == "arm64" and calling_convention == "aapcs64":
|
||||
normal_calling_convention = CALLING_CONVENTION_ARM64_AAPCS64
|
||||
|
@ -1149,7 +1235,7 @@ def process_file(assembler, architecture, calling_convention, infilename, outfil
|
|||
|
||||
# expand_files() is called recursively when a .symcryptasm file contains an INCLUDE directive,
|
||||
# except for the first call here where we're starting to process the input source file
|
||||
# as if it was included by some other file.
|
||||
# as if it was included by some other file.
|
||||
expanded_file, infile_has_includes = expand_files(infilename, 0, "")
|
||||
expanded_lines.extend(expanded_file)
|
||||
|
||||
|
@ -1163,17 +1249,20 @@ def process_file(assembler, architecture, calling_convention, infilename, outfil
|
|||
with open(outfilename, "w") as outfile:
|
||||
for line_num, line in enumerate(expanded_lines):
|
||||
processed_line = file_processing_state.process_line(line, line_num)
|
||||
# logging.info("processed line: %s" % processed_line)
|
||||
outfile.write(processed_line)
|
||||
|
||||
if __name__ == "__main__":
|
||||
import argparse
|
||||
|
||||
# logging.basicConfig(level=logging.INFO)
|
||||
parser = argparse.ArgumentParser(description="Preprocess symcryptasm into files that will be further processed with C preprocessor to generate MASM or GAS")
|
||||
parser.add_argument('assembler', type=str, help='Assembler that we want to preprocess for', choices=['masm', 'gas', 'armasm64'])
|
||||
parser.add_argument('architecture', type=str, help='Architecture that we want to preprocess for', choices=['amd64', 'arm64'])
|
||||
parser.add_argument('calling_convention', type=str, help='Calling convention that we want to preprocess for', choices=['msft', 'systemv', 'aapcs64', 'arm64ec'])
|
||||
parser.add_argument('architecture', type=str, help='Architecture that we want to preprocess for', choices=['amd64', 'arm64', 'arm'])
|
||||
parser.add_argument('calling_convention', type=str, help='Calling convention that we want to preprocess for', choices=['msft', 'systemv', 'aapcs64', 'arm64ec', 'aapcs32'])
|
||||
parser.add_argument('inputfile', type=str, help='Path to input file')
|
||||
parser.add_argument('outputfile', type=str, help='Path to output file')
|
||||
|
||||
args = parser.parse_args()
|
||||
process_file(args.assembler, args.architecture, args.calling_convention, args.inputfile, args.outputfile)
|
||||
logging.info("Done")
|
|
@ -18,6 +18,6 @@ target_link_libraries(symcryptunittest symcryptunittest_lib symcrypt_common atom
|
|||
|
||||
# Export oe_sgx_get_additional_host_entropy from the executable so that if we dynamically load
|
||||
# oe module, the linker can find the version which is locally defined in the executable
|
||||
target_link_options(symcryptunittest PRIVATE
|
||||
target_link_options(symcryptunittest PRIVATE
|
||||
-Wl,--dynamic-list=${CMAKE_CURRENT_SOURCE_DIR}/dynamic-list.ver
|
||||
)
|
||||
|
|
|
@ -97,5 +97,11 @@ else()
|
|||
add_compile_options(-DINCLUDE_IMPL_RSA32=0)
|
||||
endif()
|
||||
|
||||
if(SYMCRYPT_TARGET_ARCH STREQUAL "ARM" AND CMAKE_C_COMPILER_ID MATCHES "GNU")
|
||||
# Hide warning due to abi change.
|
||||
set_source_files_properties(kat.cpp PROPERTIES COMPILE_OPTIONS "-Wno-psabi")
|
||||
set_source_files_properties(perf.cpp PROPERTIES COMPILE_OPTIONS "-Wno-psabi")
|
||||
endif()
|
||||
|
||||
add_library(symcryptunittest_lib STATIC ${SOURCES})
|
||||
set_target_properties(symcryptunittest_lib PROPERTIES PREFIX "")
|
||||
|
|
Загрузка…
Ссылка в новой задаче