зеркало из https://github.com/microsoft/SymCrypt.git
Merged PR 9576163: Build SymCrypt with gcc-arm-linux-gnueabihf
This commit is contained in:
Родитель
17360b237b
Коммит
b6a267815e
|
@ -180,4 +180,18 @@ extends:
|
|||
config: 'Release'
|
||||
cc: 'clang'
|
||||
cxx: 'clang++'
|
||||
additionalArgs: '--toolchain=cmake-configs/Toolchain-Clang-ARM64.cmake'
|
||||
additionalArgs: '--toolchain=cmake-configs/Toolchain-Clang-ARM64.cmake'
|
||||
- template: .pipelines/templates/build-linux.yml@self
|
||||
parameters:
|
||||
arch: 'ARM'
|
||||
config: 'Debug'
|
||||
cc: 'gcc'
|
||||
cxx: 'g++'
|
||||
additionalArgs: '--toolchain=cmake-configs/Toolchain-GCC-ARM.cmake'
|
||||
- template: .pipelines/templates/build-linux.yml@self
|
||||
parameters:
|
||||
arch: 'ARM'
|
||||
config: 'Release'
|
||||
cc: 'gcc'
|
||||
cxx: 'g++'
|
||||
additionalArgs: '--toolchain=cmake-configs/Toolchain-GCC-ARM.cmake'
|
|
@ -74,6 +74,12 @@ jobs:
|
|||
apt-get install -y binutils-aarch64-linux-gnu gcc-aarch64-linux-gnu g++-aarch64-linux-gnu qemu-user
|
||||
displayName: 'Install arm64 cross-compilation tools'
|
||||
|
||||
- ${{ if eq(parameters.arch, 'ARM') }}:
|
||||
- script: |
|
||||
apt-get update
|
||||
apt-get install -y gcc-arm-linux-gnueabihf g++-arm-linux-gnueabihf qemu-user
|
||||
displayName: 'Install arm64 cross-compilation tools'
|
||||
|
||||
- task: PipAuthenticate@1
|
||||
inputs:
|
||||
artifactFeeds: 'OS/SymCrypt_PublicPackages'
|
||||
|
@ -92,23 +98,24 @@ jobs:
|
|||
|
||||
- ${{ if ne(parameters.skipTests, true) }}:
|
||||
- ${{ if ne(parameters.arch, 'ARM64') }}:
|
||||
- task: PythonScript@0
|
||||
displayName: 'Run unit tests'
|
||||
inputs:
|
||||
scriptSource: 'filePath'
|
||||
scriptPath: scripts/test.py
|
||||
arguments: 'bin noperftests'
|
||||
workingDirectory: $(Build.SourcesDirectory)
|
||||
|
||||
- ${{ if ne(parameters.config, 'Sanitize') }}:
|
||||
- ${{ if ne(parameters.arch, 'ARM') }}:
|
||||
- task: PythonScript@0
|
||||
displayName: 'Run dynamic unit tests'
|
||||
displayName: 'Run unit tests'
|
||||
inputs:
|
||||
scriptSource: 'filePath'
|
||||
scriptPath: scripts/test.py
|
||||
arguments: 'bin dynamic:bin/module/generic/libsymcrypt.so noperftests'
|
||||
arguments: 'bin noperftests'
|
||||
workingDirectory: $(Build.SourcesDirectory)
|
||||
|
||||
- ${{ if ne(parameters.config, 'Sanitize') }}:
|
||||
- task: PythonScript@0
|
||||
displayName: 'Run dynamic unit tests'
|
||||
inputs:
|
||||
scriptSource: 'filePath'
|
||||
scriptPath: scripts/test.py
|
||||
arguments: 'bin dynamic:bin/module/generic/libsymcrypt.so noperftests'
|
||||
workingDirectory: $(Build.SourcesDirectory)
|
||||
|
||||
- ${{ if eq(parameters.arch, 'AMD64') }}:
|
||||
- task: PythonScript@0
|
||||
displayName: 'Run unit tests (test YMM save/restore)'
|
||||
|
@ -134,6 +141,23 @@ jobs:
|
|||
scriptPath: scripts/test.py
|
||||
arguments: '--emulator qemu-aarch64 --emulator-lib-dir /usr/aarch64-linux-gnu/ bin dynamic:bin/module/generic/libsymcrypt.so noperftests +symcrypt -dh -dsa -rsa'
|
||||
workingDirectory: $(Build.SourcesDirectory)
|
||||
|
||||
- ${{ if eq(parameters.arch, 'ARM') }}:
|
||||
- task: PythonScript@0
|
||||
displayName: 'Run unit tests'
|
||||
inputs:
|
||||
scriptSource: 'filePath'
|
||||
scriptPath: scripts/test.py
|
||||
arguments: '--emulator qemu-arm --emulator-lib-dir /usr/arm-linux-gnueabihf/ bin noperftests +symcrypt -dh -dsa -rsa'
|
||||
workingDirectory: $(Build.SourcesDirectory)
|
||||
|
||||
- task: PythonScript@0
|
||||
displayName: 'Run dynamic unit tests'
|
||||
inputs:
|
||||
scriptSource: 'filePath'
|
||||
scriptPath: scripts/test.py
|
||||
arguments: '--emulator qemu-arm --emulator-lib-dir /usr/arm-linux-gnueabihf/ bin dynamic:bin/module/generic/libsymcrypt.so noperftests +symcrypt -dh -dsa -rsa'
|
||||
workingDirectory: $(Build.SourcesDirectory)
|
||||
|
||||
- task: PythonScript@0
|
||||
displayName: 'Package build output'
|
||||
|
|
23
BUILD.md
23
BUILD.md
|
@ -79,4 +79,25 @@ Building the SymCrypt unit tests with MSBuild requires access to the msbignum an
|
|||
be released externally due to licensing restrictions. If you wish to build directly with MSBuild, bypassing the Python
|
||||
helper script, you can run `msbuild /p:Platform=<platform> /p:Architecture=<arch> symcrypt.sln`. Note that Python is
|
||||
still required for translating SymCryptAsm. The output directory for MSBuild is always `build\bin`, and all compiled
|
||||
outputs are placed in this directory.
|
||||
outputs are placed in this directory.
|
||||
|
||||
## Building Linux targets
|
||||
|
||||
Requires the following packages on debian-based systems to build:
|
||||
```
|
||||
apt-get -y install --no-install-recommends \
|
||||
cmake \
|
||||
python3-pyelftools \ # integrity
|
||||
gcc-arm-linux-gnueabihf g++-arm-linux-gnueabihf # for arm
|
||||
```
|
||||
|
||||
And for running the test:
|
||||
```
|
||||
apt-get -y install --no-install-recommends qemu-user
|
||||
```
|
||||
|
||||
To build and test for example for arm:
|
||||
```
|
||||
python3 scripts/build.py cmake --arch arm --toolchain cmake-configs/Toolchain-GCC-ARM.cmake bin_arm
|
||||
qemu-arm -L /usr/arm-linux-gnueabihf/ ./bin_arm/exe/symcryptunittest -rsa -dsa -dh -ec -int -mod dynamic:bin_arm/module/generic/libsymcrypt.so
|
||||
```
|
|
@ -52,6 +52,8 @@ if(CMAKE_SYSTEM_NAME MATCHES "Linux")
|
|||
# GCC complains about implicit casting between ASIMD registers (i.e. uint8x16_t -> uint64x2_t) by default,
|
||||
# whereas clang and MSVC do not. Setting -flax-vector-conversions to build Arm64 intrinsics code with GCC.
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -flax-vector-conversions")
|
||||
elseif(SYMCRYPT_TARGET_ARCH MATCHES "ARM")
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=armv7-a+neon-vfpv4 -flax-vector-conversions -mfpu=neon")
|
||||
endif()
|
||||
|
||||
# add_compile_options(-Wall)
|
||||
|
|
|
@ -0,0 +1,22 @@
|
|||
# This toolchain file configures ARM cross-compilation with GCC.
|
||||
# To use the toolchain file, run cmake .. --toolchain="../cmake-configs/Toolchain-GCC-ARM.cmake"
|
||||
# Note: the --toolchain argument is only available in CMake v3.21 and newer. Prior to that version, you will have to
|
||||
# specify -DCMAKE_TOOLCHAIN_FILE instead, which may cause a spurious warning about an unused variable.
|
||||
|
||||
set(CMAKE_SYSTEM_PROCESSOR ARM)
|
||||
set(TARGET_TRIPLE arm-linux-gnueabihf)
|
||||
|
||||
set(CMAKE_ASM_COMPILER_TARGET ${TARGET_TRIPLE})
|
||||
set(CMAKE_C_COMPILER arm-linux-gnueabihf-gcc)
|
||||
|
||||
set(CMAKE_C_COMPILER_TARGET ${TARGET_TRIPLE})
|
||||
set(CMAKE_CXX_COMPILER arm-linux-gnueabihf-g++)
|
||||
set(CMAKE_CXX_COMPILER_TARGET ${TARGET_TRIPLE})
|
||||
|
||||
if(NOT CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "ARM|arm")
|
||||
# C/C++ toolchain (installed on Ubuntu using apt-get gcc-arm-linux-gnueabihf g++-arm-linux-gnueabihf)
|
||||
set(CMAKE_SYSROOT_COMPILE /usr/${TARGET_TRIPLE})
|
||||
|
||||
find_path(CXX_CROSS_INCLUDE_DIR NAMES ${TARGET_TRIPLE} PATHS /usr/${TARGET_TRIPLE}/include/ /usr/${TARGET_TRIPLE}/include/c++/ PATH_SUFFIXES 15 14 13 12 11 10 9 8 7 6 5 NO_DEFAULT_PATH)
|
||||
add_compile_options(-I${CXX_CROSS_INCLUDE_DIR}/${TARGET_TRIPLE})
|
||||
endif()
|
|
@ -1568,7 +1568,12 @@ typedef union _SYMCRYPT_GCM_SUPPORTED_BLOCKCIPHER_KEYS
|
|||
|
||||
|
||||
#if SYMCRYPT_CPU_ARM
|
||||
#include <arm_neon.h>
|
||||
#include <arm_neon.h>
|
||||
#if SYMCRYPT_GNUC
|
||||
#define __n128 uint32x4_t
|
||||
#define __n64 uint64x1_t
|
||||
#endif
|
||||
|
||||
#elif SYMCRYPT_CPU_ARM64
|
||||
|
||||
#if SYMCRYPT_MS_VC
|
||||
|
|
|
@ -125,10 +125,12 @@ function(process_cppasm filepath outformat archdefine)
|
|||
if((NOT outformat STREQUAL gas) AND (NOT outformat STREQUAL masm))
|
||||
message(FATAL_ERROR "cppasm processing invoked with unrecognized outformat (${outformat})")
|
||||
endif()
|
||||
if((NOT archdefine STREQUAL amd64) AND (NOT archdefine STREQUAL x86) AND (NOT archdefine STREQUAL arm64))
|
||||
if((NOT archdefine STREQUAL amd64) AND (NOT archdefine STREQUAL x86) AND (NOT archdefine STREQUAL arm64) AND (NOT archdefine STREQUAL arm))
|
||||
message(FATAL_ERROR "cppasm processing invoked with unrecognized archdefine (${archdefine})")
|
||||
endif()
|
||||
|
||||
get_filename_component(rootpath ${filepath} DIRECTORY)
|
||||
|
||||
get_filename_component(filestem ${filepath} NAME_WE) # filestem is the filename w/out extension
|
||||
string(TOUPPER ${outformat} outformatupper)
|
||||
string(TOUPPER ${archdefine} archdefineupper)
|
||||
|
@ -179,10 +181,10 @@ function(process_symcryptasm filepath outformat archdefine callingconvention)
|
|||
if((NOT outformat STREQUAL gas) AND (NOT outformat STREQUAL masm) AND (NOT outformat STREQUAL armasm64))
|
||||
message(FATAL_ERROR "symcryptasm processing invoked with unrecognized outformat (${outformat})")
|
||||
endif()
|
||||
if((NOT archdefine STREQUAL amd64) AND (NOT archdefine STREQUAL x86) AND (NOT archdefine STREQUAL arm64))
|
||||
if((NOT archdefine STREQUAL amd64) AND (NOT archdefine STREQUAL x86) AND (NOT archdefine STREQUAL arm64) AND (NOT archdefine STREQUAL arm))
|
||||
message(FATAL_ERROR "symcryptasm processing invoked with unrecognized archdefine (${archdefine})")
|
||||
endif()
|
||||
if((NOT callingconvention STREQUAL msft) AND (NOT callingconvention STREQUAL systemv) AND (NOT callingconvention STREQUAL aapcs64) AND (NOT callingconvention STREQUAL arm64ec))
|
||||
if((NOT callingconvention STREQUAL msft) AND (NOT callingconvention STREQUAL systemv) AND (NOT callingconvention STREQUAL aapcs64) AND (NOT callingconvention STREQUAL arm64ec) AND (NOT callingconvention STREQUAL aapcs32))
|
||||
message(FATAL_ERROR "symcryptasm processing invoked with unrecognized callingconvention (${callingconvention})")
|
||||
endif()
|
||||
get_filename_component(rootpath ${filepath} DIRECTORY)
|
||||
|
@ -229,8 +231,8 @@ if(WIN32 AND SYMCRYPT_USE_ASM)
|
|||
amd64/fdef369_asm-masm.asm
|
||||
amd64/fdef_mulx-masm.asm
|
||||
amd64/wipe-masm.asm
|
||||
amd64/sha256xmm_asm-masm.asm
|
||||
amd64/sha256ymm_asm-masm.asm
|
||||
amd64/sha256xmm_asm-masm.asm
|
||||
amd64/sha256ymm_asm-masm.asm
|
||||
amd64/sha512ymm_asm-masm.asm
|
||||
amd64/sha512ymm_avx512vl_asm-masm.asm)
|
||||
set_source_files_properties(
|
||||
|
@ -239,8 +241,8 @@ if(WIN32 AND SYMCRYPT_USE_ASM)
|
|||
amd64/fdef369_asm-masm.asm
|
||||
amd64/fdef_mulx-masm.asm
|
||||
amd64/wipe-masm.asm
|
||||
amd64/sha256xmm_asm-masm.asm
|
||||
amd64/sha256ymm_asm-masm.asm
|
||||
amd64/sha256xmm_asm-masm.asm
|
||||
amd64/sha256ymm_asm-masm.asm
|
||||
amd64/sha512ymm_asm-masm.asm
|
||||
amd64/sha512ymm_avx512vl_asm-masm.asm
|
||||
PROPERTY LANGUAGE ASM_MASM)
|
||||
|
@ -294,7 +296,7 @@ elseif(SYMCRYPT_USE_ASM) # Linux
|
|||
amd64/sha512ymm_asm-gas.asm
|
||||
amd64/sha512ymm_avx512vl_asm-gas.asm
|
||||
PROPERTY LANGUAGE ASM)
|
||||
elseif(SYMCRYPT_TARGET_ARCH MATCHES "ARM64")
|
||||
elseif(SYMCRYPT_TARGET_ARCH STREQUAL "ARM64")
|
||||
process_symcryptasm(arm64/fdef_asm.symcryptasm gas arm64 aapcs64)
|
||||
process_symcryptasm(arm64/fdef369_asm.symcryptasm gas arm64 aapcs64)
|
||||
process_symcryptasm(arm64/wipe.symcryptasm gas arm64 aapcs64)
|
||||
|
@ -308,6 +310,22 @@ elseif(SYMCRYPT_USE_ASM) # Linux
|
|||
arm64/fdef369_asm-gas.asm
|
||||
arm64/wipe-gas.asm
|
||||
PROPERTY LANGUAGE ASM)
|
||||
elseif(SYMCRYPT_TARGET_ARCH STREQUAL "ARM")
|
||||
process_symcryptasm(arm/aesasm.symcryptasm gas arm aapcs32)
|
||||
process_symcryptasm(arm/fdef_asm.symcryptasm gas arm aapcs32)
|
||||
process_symcryptasm(arm/wipe.symcryptasm gas arm aapcs32)
|
||||
|
||||
list(APPEND SOURCES_COMMON
|
||||
arm/aesasm-gas.asm
|
||||
arm/fdef_asm-gas.asm
|
||||
arm/wipe-gas.asm)
|
||||
set_source_files_properties(
|
||||
arm/aesasm-gas.asm
|
||||
arm/fdef_asm-gas.asm
|
||||
arm/wipe-gas.asm
|
||||
PROPERTY LANGUAGE ASM)
|
||||
set_source_files_properties(
|
||||
arm/fdef_asm-gas.asm PROPERTIES INCLUDE_DIRECTORIES ${CMAKE_CURRENT_SOURCE_DIR}/arm)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
|
|
|
@ -0,0 +1,977 @@
|
|||
//
|
||||
// AesAsm.cppasm Assembler code for fast AES on ARM
|
||||
//
|
||||
// Copyright (c) Microsoft Corporation. Licensed under the MIT license.
|
||||
//
|
||||
// This code is derived from the AMD64 version of the AesFast
|
||||
// implementation, developed by Niels Ferguson. For questions
|
||||
// about the ARM specifics, contact Aaron Giles.
|
||||
//
|
||||
|
||||
// #include "kxarm.h"
|
||||
|
||||
#include "symcryptasm_shared.cppasm"
|
||||
|
||||
#if SYMCRYPT_DEBUG
|
||||
SET(SYMCRYPT_CODE_VERSION, ((SYMCRYPT_CODE_VERSION_API * 65536) + SYMCRYPT_CODE_VERSION_MINOR))
|
||||
SET(SYMCRYPT_MAGIC_CONSTANT, HEX(53316D76) + SYMCRYPT_CODE_VERSION) // 0x53316D76 == 'S1mv'
|
||||
|
||||
MACRO_START(SYMCRYPT_CHECK_MAGIC, temp1, temp2, ptr, offset, check_magic_label)
|
||||
ldr temp1, [ptr, #offset]
|
||||
subs temp1, temp1, ptr
|
||||
//mov32 temp2, =SYMCRYPT_MAGIC_CONSTANT
|
||||
ldr temp2, =SYMCRYPT_MAGIC_CONSTANT
|
||||
cmp temp1, temp2
|
||||
beq check_magic_label
|
||||
//mov32 r0, 0x6d616763 // 'magc'
|
||||
ldr r0, =0x6d616763 // 'magc'
|
||||
bl SymCryptFatal
|
||||
check_magic_label:
|
||||
MACRO_END()
|
||||
#else
|
||||
MACRO_START(SYMCRYPT_CHECK_MAGIC, temp1, temp2, ptr, offset, check_magic_label)
|
||||
MACRO_END()
|
||||
#endif
|
||||
|
||||
|
||||
// TTL "Advanced Encryption Standard (AES)"
|
||||
|
||||
//
|
||||
// typedef SYMCRYPT_ALIGN_STRUCT _SYMCRYPT_AES_EXPANDED_KEY {
|
||||
// SYMCRYPT_ALIGN BYTE RoundKey[29][4][4];
|
||||
// Round keys, first the encryption round keys in encryption order,
|
||||
// followed by the decryption round keys in decryption order.
|
||||
// The first decryption round key is the last encryption round key.
|
||||
// AES-256 has 14 rounds and thus 15 round keys for encryption and 15
|
||||
// for decryption. As they share one round key, we need room for 29.
|
||||
// BYTE (*lastEncRoundKey)[4][4]; Pointer to last encryption round key
|
||||
// also the first round key for decryption
|
||||
// BYTE (*lastDecRoundKey)[4][4]; Pointer to last decryption round key.
|
||||
//
|
||||
// SYMCRYPT_MAGIC_FIELD
|
||||
// } SYMCRYPT_AES_EXPANDED_KEY, *PSYMCRYPT_AES_EXPANDED_KEY
|
||||
//
|
||||
|
||||
#define SYMCRYPT_AES_EXPANDED_KEY_RoundKey (0)
|
||||
#define SYMCRYPT_AES_EXPANDED_KEY_lastEncRoundKey (29*4*4)
|
||||
#define SYMCRYPT_AES_EXPANDED_KEY_lastDecRoundKey (29*4*4+4)
|
||||
|
||||
#if SYMCRYPT_DEBUG
|
||||
#define SYMCRYPT_AES_EXPANDED_KEY_magic (29*4*4+4+4)
|
||||
#endif
|
||||
|
||||
MACRO_START(ENC_MIX, keyptr)
|
||||
//
|
||||
// Perform the unkeyed mixing function for encryption
|
||||
// plus a key addition from the key pointer
|
||||
//
|
||||
// Input:
|
||||
// r0,r1,r2,r3 = current block
|
||||
// keyptr = pointer to current key
|
||||
// r12 = pointer to AesSboxMatrixMult
|
||||
//
|
||||
// Output:
|
||||
// r0,r1,r2,r3 = updated block
|
||||
// keyptr = updated to point to following key
|
||||
// r12 = unmodified
|
||||
//
|
||||
// Used:
|
||||
// r4,r5,r6,r7,lr are modified
|
||||
//
|
||||
// N.B. To make better use of ARM's barrel shifter, this code differs
|
||||
// from the AMD64 approach. The first lookups are not rotated;
|
||||
// instead all subsequent lookups are applied on top rotated, and
|
||||
// then a final rotation is performed to shift the bits into the
|
||||
// proper spot.
|
||||
//
|
||||
|
||||
uxtb r4, r0 // extract individual bytes from r0
|
||||
uxtb r7, r0, ror #8 //
|
||||
uxtb r6, r0, ror #16 //
|
||||
uxtb r5, r0, ror #24 //
|
||||
ldr r4, [r12, r4, lsl #2] // perform lookups of each byte, leaving
|
||||
ldr r7, [r12, r7, lsl #2] // the values unrotated for now
|
||||
ldr r6, [r12, r6, lsl #2] //
|
||||
ldr r5, [r12, r5, lsl #2] //
|
||||
|
||||
uxtb r0, r1 // extract individual bytes from r1
|
||||
uxtb lr, r1, ror #8 // (with 1 more register we could do 4 at a time)
|
||||
ldr r0, [r12, r0, lsl #2] // perform lookups
|
||||
ldr lr, [r12, lr, lsl #2] //
|
||||
eor r5, r5, r0, ror #24 // exclusive-OR with previous
|
||||
eor r4, r4, lr, ror #24 //
|
||||
uxtb r0, r1, ror #16 // extract remaining bytes from r1
|
||||
uxtb r1, r1, ror #24 //
|
||||
ldr r0, [r12, r0, lsl #2] // perform lookups
|
||||
ldr r1, [r12, r1, lsl #2] //
|
||||
eor r7, r7, r0, ror #24 // exclusive-OR with previous
|
||||
eor r6, r6, r1, ror #24 //
|
||||
|
||||
uxtb r0, r2 // extract individual bytes from r2
|
||||
uxtb r1, r2, ror #8 //
|
||||
uxtb lr, r2, ror #16 //
|
||||
uxtb r2, r2, ror #24 //
|
||||
ldr r0, [r12, r0, lsl #2] // perform lookups
|
||||
ldr r1, [r12, r1, lsl #2] //
|
||||
ldr lr, [r12, lr, lsl #2] //
|
||||
ldr r2, [r12, r2, lsl #2] //
|
||||
eor r6, r6, r0, ror #16 // exclusive-OR with previous
|
||||
eor r5, r5, r1, ror #16 //
|
||||
eor r4, r4, lr, ror #16 //
|
||||
eor r7, r7, r2, ror #16 //
|
||||
|
||||
uxtb r0, r3 // extract individual bytes from r3
|
||||
uxtb r1, r3, ror #8 //
|
||||
uxtb r2, r3, ror #16 //
|
||||
uxtb r3, r3, ror #24 //
|
||||
ldr r0, [r12, r0, lsl #2] // perform lookups
|
||||
ldr r1, [r12, r1, lsl #2] //
|
||||
ldr r2, [r12, r2, lsl #2] //
|
||||
ldr r3, [r12, r3, lsl #2] //
|
||||
eor r7, r7, r0, ror #8 // exclusive-OR with previous
|
||||
eor r6, r6, r1, ror #8 //
|
||||
eor r5, r5, r2, ror #8 //
|
||||
eor r4, r4, r3, ror #8 //
|
||||
|
||||
ldrd r0, r1, [keyptr, #0] // fetch key into r0-r3
|
||||
ldrd r2, r3, [keyptr, #8] //
|
||||
adds keyptr, keyptr, #16 // increment key pointer
|
||||
eors r0, r0, r4 // exclusive-OR the key and rotate into final
|
||||
eor r1, r1, r5, ror #8 // position
|
||||
eor r2, r2, r6, ror #16 //
|
||||
eor r3, r3, r7, ror #24 //
|
||||
MACRO_END()
|
||||
|
||||
MACRO_START(DEC_MIX, keyptr)
|
||||
//
|
||||
// Perform the unkeyed mixing function for decryption
|
||||
//
|
||||
// Input:
|
||||
// r0,r1,r2,r3 = current block
|
||||
// keyptr = pointer to current key
|
||||
// r12 = pointer to AesInvSboxMatrixMult
|
||||
//
|
||||
// Output:
|
||||
// r0,r1,r2,r3 = updated block
|
||||
// keyptr = updated to point to following key
|
||||
// r12 = unmodified
|
||||
//
|
||||
// Used:
|
||||
// r4,r5,r6,r7,lr are modified
|
||||
//
|
||||
// N.B. To make better use of ARM's barrel shifter, this code differs
|
||||
// from the AMD64 approach. The first lookups are not rotated;
|
||||
// instead all subsequent lookups are applied on top rotated, and
|
||||
// then a final rotation is performed to shift the bits into the
|
||||
// proper spot.
|
||||
//
|
||||
|
||||
uxtb r4, r0 // extract individual bytes from r0
|
||||
uxtb r5, r0, ror #8 //
|
||||
uxtb r6, r0, ror #16 //
|
||||
uxtb r7, r0, ror #24 //
|
||||
ldr r4, [r12, r4, lsl #2] // perform lookups of each byte, leaving
|
||||
ldr r5, [r12, r5, lsl #2] // the values unrotated for now
|
||||
ldr r6, [r12, r6, lsl #2] //
|
||||
ldr r7, [r12, r7, lsl #2] //
|
||||
|
||||
uxtb r0, r1 // extract individual bytes from r1
|
||||
uxtb lr, r1, ror #8 // (with 1 more register we could do 4 at a time)
|
||||
ldr r0, [r12, r0, lsl #2] // perform lookups
|
||||
ldr lr, [r12, lr, lsl #2] //
|
||||
eor r5, r5, r0, ror #8 // exclusive-OR with previous
|
||||
eor r6, r6, lr, ror #8 //
|
||||
uxtb r0, r1, ror #16 // extract remaining bytes from r1
|
||||
uxtb r1, r1, ror #24 //
|
||||
ldr r0, [r12, r0, lsl #2] // perform lookups
|
||||
ldr r1, [r12, r1, lsl #2] //
|
||||
eor r7, r7, r0, ror #8 // exclusive-OR with previous
|
||||
eor r4, r4, r1, ror #8 //
|
||||
|
||||
uxtb r0, r2 // extract individual bytes from r2
|
||||
uxtb r1, r2, ror #8 //
|
||||
uxtb lr, r2, ror #16 //
|
||||
uxtb r2, r2, ror #24 //
|
||||
ldr r0, [r12, r0, lsl #2] // perform lookups
|
||||
ldr r1, [r12, r1, lsl #2] //
|
||||
ldr lr, [r12, lr, lsl #2] //
|
||||
ldr r2, [r12, r2, lsl #2] //
|
||||
eor r6, r6, r0, ror #16 // exclusive-OR with previous
|
||||
eor r7, r7, r1, ror #16 //
|
||||
eor r4, r4, lr, ror #16 //
|
||||
eor r5, r5, r2, ror #16 //
|
||||
|
||||
uxtb r0, r3 // extract individual bytes from r3
|
||||
uxtb r1, r3, ror #8 //
|
||||
uxtb r2, r3, ror #16 //
|
||||
uxtb r3, r3, ror #24 //
|
||||
ldr r0, [r12, r0, lsl #2] // perform lookups
|
||||
ldr r1, [r12, r1, lsl #2] //
|
||||
ldr r2, [r12, r2, lsl #2] //
|
||||
ldr r3, [r12, r3, lsl #2] //
|
||||
eor r7, r7, r0, ror #24 // exclusive-OR with previous
|
||||
eor r4, r4, r1, ror #24 //
|
||||
eor r5, r5, r2, ror #24 //
|
||||
eor r6, r6, r3, ror #24 //
|
||||
|
||||
ldrd r0, r1, [keyptr, #0] // fetch key into r0-r3
|
||||
ldrd r2, r3, [keyptr, #8] //
|
||||
adds keyptr, keyptr, #16 // increment key pointer
|
||||
eors r0, r0, r4 // exclusive-OR the key and rotate into final
|
||||
eor r1, r1, r5, ror #24 // position
|
||||
eor r2, r2, r6, ror #16 //
|
||||
eor r3, r3, r7, ror #8 //
|
||||
MACRO_END()
|
||||
|
||||
MACRO_START(AES_ENCRYPT, loopLabel)
|
||||
|
||||
//
|
||||
// Input:
|
||||
// r0,r1,r2,r3 = plaintext
|
||||
// r8 = pointer to first round key to use
|
||||
// r9 = pointer to last key to use
|
||||
// r12 = pointer to AesSboxMatrixMult
|
||||
//
|
||||
// Output:
|
||||
// r4,r5,r6,r7 = ciphertext
|
||||
// r8 = modified to point to last key
|
||||
// r9 = unmodified
|
||||
// r12 = unmodified
|
||||
//
|
||||
// Used:
|
||||
// lr is also modified
|
||||
//
|
||||
|
||||
//
|
||||
// xor in first round key
|
||||
//
|
||||
|
||||
ldrd r4, r5, [r8, #0] // fetch key in r4-r7
|
||||
ldrd r6, r7, [r8, #8] //
|
||||
eors r0, r0, r4 // exclusive-OR with the plaintext
|
||||
eors r1, r1, r5 //
|
||||
eors r2, r2, r6 //
|
||||
eors r3, r3, r7 //
|
||||
|
||||
add r8, r8, #16 // point to second key
|
||||
|
||||
loopLabel:
|
||||
//
|
||||
// Block is r0,r1,r2,r3
|
||||
// r8 points to current round key
|
||||
//
|
||||
|
||||
ENC_MIX r8 // encrypt the block and increment key
|
||||
cmp r8, r9 // are we at the end?
|
||||
blo loopLabel // loop until it is so
|
||||
|
||||
//
|
||||
// Now for the final round
|
||||
// We use the fact that SboxMatrixMult[0] table is also
|
||||
// an Sbox table if you use the second element of each entry.
|
||||
//
|
||||
// Result is in r4,r5,r6,r7
|
||||
//
|
||||
|
||||
add r12, r12, #1 // advance by 1 to point to second element
|
||||
uxtb r4, r0 // extract individual bytes from r0
|
||||
uxtb r7, r0, ror #8 //
|
||||
uxtb r6, r0, ror #16 //
|
||||
uxtb r5, r0, ror #24 //
|
||||
ldrb r4, [r12, r4, lsl #2] // perform lookups of each byte, leaving
|
||||
ldrb r7, [r12, r7, lsl #2] // the values unrotated for now
|
||||
ldrb r6, [r12, r6, lsl #2] //
|
||||
ldrb r5, [r12, r5, lsl #2] //
|
||||
|
||||
uxtb r0, r1 // extract individual bytes from r1
|
||||
uxtb lr, r1, ror #8 // (with 1 more register we could do 4 at a time)
|
||||
ldrb r0, [r12, r0, lsl #2] // perform lookups
|
||||
ldrb lr, [r12, lr, lsl #2] //
|
||||
orr r5, r5, r0, lsl #8 // merge with previous
|
||||
orr r4, r4, lr, lsl #8 //
|
||||
uxtb r0, r1, ror #16 // extract remaining bytes from r1
|
||||
uxtb r1, r1, ror #24 //
|
||||
ldrb r0, [r12, r0, lsl #2] // perform lookups
|
||||
ldrb r1, [r12, r1, lsl #2] //
|
||||
orr r7, r7, r0, lsl #8 // merge with previous
|
||||
orr r6, r6, r1, lsl #8 //
|
||||
|
||||
uxtb r0, r2 // extract individual bytes from r2
|
||||
uxtb r1, r2, ror #8 //
|
||||
uxtb lr, r2, ror #16 //
|
||||
uxtb r2, r2, ror #24 //
|
||||
ldrb r0, [r12, r0, lsl #2] // perform lookups
|
||||
ldrb r1, [r12, r1, lsl #2] //
|
||||
ldrb lr, [r12, lr, lsl #2] //
|
||||
ldrb r2, [r12, r2, lsl #2] //
|
||||
orr r6, r6, r0, lsl #16 // merge with previous
|
||||
orr r5, r5, r1, lsl #16 //
|
||||
orr r4, r4, lr, lsl #16 //
|
||||
orr r7, r7, r2, lsl #16 //
|
||||
|
||||
uxtb r0, r3 // extract individual bytes from r3
|
||||
uxtb r1, r3, ror #8 //
|
||||
uxtb r2, r3, ror #16 //
|
||||
uxtb r3, r3, ror #24 //
|
||||
ldrb r0, [r12, r0, lsl #2] // perform lookups
|
||||
ldrb r1, [r12, r1, lsl #2] //
|
||||
ldrb r2, [r12, r2, lsl #2] //
|
||||
ldrb r3, [r12, r3, lsl #2] //
|
||||
orr r7, r7, r0, lsl #24 // merge with previous
|
||||
orr r6, r6, r1, lsl #24 //
|
||||
orr r5, r5, r2, lsl #24 //
|
||||
orr r4, r4, r3, lsl #24 //
|
||||
sub r12, r12, #1 // put r12 back to its original value
|
||||
|
||||
//
|
||||
// xor in final round key
|
||||
//
|
||||
|
||||
ldrd r0, r1, [r9, #0] // fetch key into r0-r3
|
||||
ldrd r2, r3, [r9, #8] //
|
||||
eors r4, r4, r0 // exclusive-OR the key and rotate into final
|
||||
eor r5, r1, r5, ror #8 // position
|
||||
eor r6, r2, r6, ror #16 //
|
||||
eor r7, r3, r7, ror #24 //
|
||||
MACRO_END()
|
||||
|
||||
MACRO_START(AES_DECRYPT, loopLabel)
|
||||
|
||||
//
|
||||
// Input:
|
||||
// r0,r1,r2,r3 = ciphertext
|
||||
// r8 = pointer to first round key to use
|
||||
// r9 = pointer to last key to use
|
||||
// r10 = pointer to InvSbox
|
||||
// r12 = pointer to InvSboxMatrixMult
|
||||
//
|
||||
// Output:
|
||||
// r4,r5,r6,r7 = plaintext
|
||||
// r8 = modified to point to last key
|
||||
// r9 = unmodified
|
||||
// r10 = unmodified
|
||||
// r12 = unmodified
|
||||
//
|
||||
// Used:
|
||||
// lr is also modified
|
||||
//
|
||||
|
||||
//
|
||||
// xor in first round key
|
||||
//
|
||||
ldrd r4, r5, [r8, #0] // fetch key in r4-r7
|
||||
ldrd r6, r7, [r8, #8] //
|
||||
eors r0, r0, r4 // exclusive-OR with the plaintext
|
||||
eors r1, r1, r5 //
|
||||
eors r2, r2, r6 //
|
||||
eors r3, r3, r7 //
|
||||
|
||||
add r8, r8, #16 // point to second key
|
||||
|
||||
loopLabel:
|
||||
//
|
||||
// Block is r0, r1, r2, r3
|
||||
// r8 points to current round key
|
||||
//
|
||||
|
||||
DEC_MIX r8 // decrypt the block and increment key
|
||||
cmp r8, r9 // are we at the end?
|
||||
blo loopLabel // loop until it is so
|
||||
|
||||
//
|
||||
// Now for the final round
|
||||
// Result is in r4, r5, r6, r7
|
||||
//
|
||||
|
||||
uxtb r4, r0 // extract individual bytes from r0
|
||||
uxtb r5, r0, ror #8 //
|
||||
uxtb r6, r0, ror #16 //
|
||||
uxtb r7, r0, ror #24 //
|
||||
ldrb r4, [r10, r4] // perform lookups of each byte, leaving
|
||||
ldrb r5, [r10, r5] // the values unrotated for now
|
||||
ldrb r6, [r10, r6] //
|
||||
ldrb r7, [r10, r7] //
|
||||
|
||||
uxtb r0, r1 // extract individual bytes from r1
|
||||
uxtb lr, r1, ror #8 // (with 1 more register we could do 4 at a time)
|
||||
ldrb r0, [r10, r0] // perform lookups
|
||||
ldrb lr, [r10, lr] //
|
||||
orr r5, r5, r0, lsl #24 // merge with previous
|
||||
orr r6, r6, lr, lsl #24 //
|
||||
uxtb r0, r1, ror #16 // extract remaining bytes from r1
|
||||
uxtb r1, r1, ror #24 //
|
||||
ldrb r0, [r10, r0] // perform lookups
|
||||
ldrb r1, [r10, r1] //
|
||||
orr r7, r7, r0, lsl #24 // merge with previous
|
||||
orr r4, r4, r1, lsl #24 //
|
||||
|
||||
uxtb r0, r2 // extract individual bytes from r2
|
||||
uxtb r1, r2, ror #8 //
|
||||
uxtb lr, r2, ror #16 //
|
||||
uxtb r2, r2, ror #24 //
|
||||
ldrb r0, [r10, r0] // perform lookups
|
||||
ldrb r1, [r10, r1] //
|
||||
ldrb lr, [r10, lr] //
|
||||
ldrb r2, [r10, r2] //
|
||||
orrs r6, r6, r0, lsl #16 // merge with previous
|
||||
orr r7, r7, r1, lsl #16 //
|
||||
orr r4, r4, lr, lsl #16 //
|
||||
orr r5, r5, r2, lsl #16 //
|
||||
|
||||
uxtb r0, r3 // extract individual bytes from r3
|
||||
uxtb r1, r3, ror #8 //
|
||||
uxtb r2, r3, ror #16 //
|
||||
uxtb r3, r3, ror #24 //
|
||||
ldrb r0, [r10, r0] // perform lookups
|
||||
ldrb r1, [r10, r1] //
|
||||
ldrb r2, [r10, r2] //
|
||||
ldrb r3, [r10, r3] //
|
||||
orr r7, r7, r0, lsl #8 // merge with previous
|
||||
orr r4, r4, r1, lsl #8 //
|
||||
orr r5, r5, r2, lsl #8 //
|
||||
orr r6, r6, r3, lsl #8 //
|
||||
|
||||
//
|
||||
// xor in final round key
|
||||
//
|
||||
|
||||
ldrd r0, r1, [r9, #0] // fetch key into r0-r3
|
||||
ldrd r2, r3, [r9, #8] //
|
||||
eors r4, r4, r0 // exclusive-OR the key and rotate into final
|
||||
eor r5, r1, r5, ror #24 // position
|
||||
eor r6, r2, r6, ror #16 //
|
||||
eor r7, r3, r7, ror #8 //
|
||||
|
||||
MACRO_END()
|
||||
|
||||
|
||||
//
|
||||
// VOID
|
||||
// SYMCRYPT_CALL
|
||||
// SymCryptAesEncrypt( _In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
|
||||
// _In_reads_bytes_( SYMCRYPT_AES_BLOCK_LEN ) PCBYTE pbPlaintext,
|
||||
// _Out_writes_bytes_( SYMCRYPT_AES_BLOCK_LEN ) PBYTE pbCiphertext )
|
||||
//
|
||||
// NESTED_ENTRY SymCryptAesEncryptAsm
|
||||
FUNCTION_START(SymCryptAesEncryptAsm, 3, 12)
|
||||
|
||||
//
|
||||
// Input parameters:
|
||||
// r0 = pExpandedKey
|
||||
// r1 = pbPlaintext
|
||||
// r2 = pbCiphertext
|
||||
//
|
||||
|
||||
// push {r2, r4-r11, lr}
|
||||
push {r2}
|
||||
|
||||
//
|
||||
// Stack layout:
|
||||
// [sp] = r2 = pbCipherText
|
||||
//
|
||||
|
||||
SYMCRYPT_CHECK_MAGIC r4, r5, r0, SYMCRYPT_AES_EXPANDED_KEY_magic, SymCryptAesEncryptAsm_check_magic_label
|
||||
|
||||
ldr r9, [r0, #SYMCRYPT_AES_EXPANDED_KEY_lastEncRoundKey] // r9 = last key
|
||||
mov r8, r0 // r8 = first key
|
||||
// mov32 r12, SymCryptAesSboxMatrixMult // r12 = matrix mult table
|
||||
ldr r12, =SymCryptAesSboxMatrixMult // r12 = matrix mult table
|
||||
|
||||
ldr r0, [r1, #0] // load the plaintext
|
||||
ldr r2, [r1, #8] //
|
||||
ldr r3, [r1, #12] //
|
||||
ldr r1, [r1, #4] //
|
||||
|
||||
AES_ENCRYPT SymCryptAesEncryptAsm_loopLabel
|
||||
//
|
||||
// Plaintext in r0, r1, r2, r3
|
||||
// r8 points to first round key to use
|
||||
// r9 is last key to use (unchanged)
|
||||
// r12 points to SboxMatrixMult (unchanged)
|
||||
// Ciphertext ends up in r4, r5, r6, r7
|
||||
//
|
||||
|
||||
ldr r0, [sp] // recover pbCipherText
|
||||
str r4, [r0, #0] // store the encrypted data
|
||||
str r5, [r0, #4] //
|
||||
str r6, [r0, #8] //
|
||||
str r7, [r0, #12] //
|
||||
|
||||
// pop {r2, r4-r11, pc} // return
|
||||
pop {r2}
|
||||
|
||||
// NESTED_END SymCryptAesEncryptAsm
|
||||
FUNCTION_END(SymCryptAesEncryptAsm)
|
||||
|
||||
//
|
||||
// VOID
|
||||
// SYMCRYPT_CALL
|
||||
// SymCryptAesDecrypt( _In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
|
||||
// _In_reads_bytes_( SYMCRYPT_AES_BLOCK_LEN ) PCBYTE pbCiphertext,
|
||||
// _Out_writes_bytes_( SYMCRYPT_AES_BLOCK_LEN ) PBYTE pbPlaintext )
|
||||
//
|
||||
// NESTED_ENTRY SymCryptAesDecryptAsm
|
||||
FUNCTION_START(SymCryptAesDecryptAsm, 3, 12)
|
||||
|
||||
//
|
||||
// Input parameters:
|
||||
// r0 = pExpandedKey
|
||||
// r1 = pbCiphertext
|
||||
// r2 = pbPlaintext
|
||||
//
|
||||
|
||||
// push {r2, r4-r11, lr}
|
||||
push {r2}
|
||||
|
||||
//
|
||||
// Stack layout:
|
||||
// [sp] = r2 = pbPlaintext
|
||||
//
|
||||
|
||||
SYMCRYPT_CHECK_MAGIC r4, r5, r0, SYMCRYPT_AES_EXPANDED_KEY_magic, SymCryptAesDecryptAsm_check_magic_label
|
||||
|
||||
ldr r8, [r0, #SYMCRYPT_AES_EXPANDED_KEY_lastEncRoundKey] // r8 = first key
|
||||
ldr r9, [r0, #SYMCRYPT_AES_EXPANDED_KEY_lastDecRoundKey] // r9 = last key
|
||||
// mov32 r10, SymCryptAesInvSbox // r10 = inverse sbox table
|
||||
// mov32 r12, SymCryptAesInvSboxMatrixMult // r11 = inverse matrix mult table
|
||||
ldr r10, =SymCryptAesInvSbox // r10 = inverse sbox table
|
||||
ldr r12, =SymCryptAesInvSboxMatrixMult // r11 = inverse matrix mult table
|
||||
|
||||
ldr r0, [r1, #0] // load the ciphertext
|
||||
ldr r2, [r1, #8] //
|
||||
ldr r3, [r1, #12] //
|
||||
ldr r1, [r1, #4] //
|
||||
|
||||
AES_DECRYPT SymCryptAesDecryptAsm_loopLabel
|
||||
//
|
||||
// Ciphertext in r0, r1, r2, r3
|
||||
// r8 points to first round key to use
|
||||
// r9 is last key to use (unchanged)
|
||||
// r10 points to InvSbox (unchanged)
|
||||
// r12 points to InvSboxMatrixMult (unchanged)
|
||||
// Ciphertext ends up in r4, r5, r6, r7
|
||||
//
|
||||
|
||||
ldr r0, [sp] // recover pbPlaintext
|
||||
str r4, [r0, #0] // store the decrypted data
|
||||
str r5, [r0, #4] //
|
||||
str r6, [r0, #8] //
|
||||
str r7, [r0, #12] //
|
||||
|
||||
pop {r2}
|
||||
// pop {r2, r4-r11, pc} // return
|
||||
|
||||
// NESTED_END SymCryptAesDecryptAsm
|
||||
FUNCTION_END(SymCryptAesDecryptAsm)
|
||||
|
||||
|
||||
//
// VOID
// SYMCRYPT_CALL
// SymCryptAesCbcEncrypt(
//     _In_                                        PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
//     _In_reads_bytes_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE                       pbChainingValue,
//     _In_reads_bytes_( cbData )                  PCBYTE                      pbSrc,
//     _Out_writes_bytes_( cbData )                PBYTE                       pbDst,
//                                                 SIZE_T                      cbData )

// NESTED_ENTRY SymCryptAesCbcEncryptAsm
FUNCTION_START(SymCryptAesCbcEncryptAsm, 4, 12)

    //
    // Input parameters:
    //  r0 = pExpandedKey
    //  r1 = pbChainingValue
    //  r2 = pbSrc
    //  r3 = pbDst
    //  [sp] = cbData
    //

    // push {r0-r2, r4-r11, lr}         // (r4-r11/lr are saved by the FUNCTION_START prologue)
    push {r0-r2}
    sub sp, sp, #16

    //
    // Stack layout:
    // [sp]    = pbSrc (current)
    // [sp+4]  = pbSrcEnd
    // [sp+16] = r0 = pbExpandedKey
    // [sp+20] = r1 = pbChainingValue
    // [sp+24] = r2 = pbSrc
    // [sp+64] = cbData
    //

    SYMCRYPT_CHECK_MAGIC r4, r5, r0, SYMCRYPT_AES_EXPANDED_KEY_magic, SymCryptAesCbcEncryptAsm_check_magic_label

    pld     [r2]                // prefetch source data
    ldr     r4, [sp, #64]       // r4 = cbData
    mov     r8, r2              // r8 = pbSrc on loop entry
    ldr     r9, [r0, #SYMCRYPT_AES_EXPANDED_KEY_lastEncRoundKey]    // r9 = last enc round key (invariant)
    mov     r10, r3             // r10 = pbDst
    // mov32 r12, SymCryptAesSboxMatrixMult
    ldr     r12, =SymCryptAesSboxMatrixMult     // r12 = pointer to lookup table (invariant)
    bics    r4, r4, #15         // r4 &= ~15: round cbData down to whole blocks
    beq     SymCryptAesCbcEncryptNoData         // skip if no data
    adds    r4, r4, r2          // r4 = pbSrc + cbData = pbSrcEnd
    // strd r2, r4, [sp]        // save pbSrc/pbSrcEnd at [sp]
    str     r2, [sp]
    str     r4, [sp, #4]

    pld     [r8, #32]           // prefetch source data
    ldr     r4, [r1, #0]        // load chaining state from pbChainingValue
    ldr     r5, [r1, #4]        //
    ldr     r6, [r1, #8]        //
    ldr     r7, [r1, #12]       //

SymCryptAesCbcEncryptAsmLoop:
    //
    // Loop register setup
    //  r4,r5,r6,r7 = chaining state
    //  r8  = pbSrc
    //  r9  = last round key to use
    //  r10 = pbDst
    //  r12 = SboxMatrixMult
    //

    ldr     r0, [r8]            // read next 16 bytes of plaintext
    ldr     r1, [r8, #4]        //
    ldr     r2, [r8, #8]        //
    ldr     r3, [r8, #12]       //
    pld     [r8, #64]           // prefetch source data
    add     r8, r8, #16         // pbSrc += 16
    str     r8, [sp]            // save it (r8 is clobbered below)

    eors    r0, r0, r4          // exclusive-OR against chaining value
    eors    r1, r1, r5          //
    eors    r2, r2, r6          //
    eors    r3, r3, r7          //

    ldr     r8, [sp, #16]       // r8 = first round key
    AES_ENCRYPT SymCryptAesCbcEncryptAsm_loopLabel
    //
    // Plaintext in r0, r1, r2, r3
    // r8 points to first round key to use
    // r9 is last key to use (unchanged)
    // r12 points to SboxMatrixMult (unchanged)
    // Ciphertext ends up in r4, r5, r6, r7
    //

    // ldrd r8, r0, [sp]        // fetch pbSrc/pbSrcEnd
    ldr     r8, [sp]
    ldr     r0, [sp, #4]

    str     r4, [r10, #0]       // write ciphertext
    str     r5, [r10, #4]       //
    str     r6, [r10, #8]       //
    str     r7, [r10, #12]      //
    add     r10, r10, #16       // pbDst += 16

    cmp     r8, r0              // are we at the end of source?
    blo     SymCryptAesCbcEncryptAsmLoop        // loop until we are

    ldr     r0, [sp, #20]       // r0 = pbChainingValue
    str     r4, [r0, #0]        // update the chaining value (last ciphertext block)
    str     r5, [r0, #4]        //
    str     r6, [r0, #8]        //
    str     r7, [r0, #12]       //

SymCryptAesCbcEncryptNoData:

    add     sp, sp, #16
    pop     {r0-r2}
    // pop {r0-r2, r4-r11, pc}  // return (epilogue emitted by FUNCTION_END)

// NESTED_END SymCryptAesCbcEncryptAsm
FUNCTION_END(SymCryptAesCbcEncryptAsm)
|
||||
|
||||
|
||||
//
// VOID
// SYMCRYPT_CALL
// SymCryptAesCbcDecrypt(
//     _In_                                        PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
//     _In_reads_bytes_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE                       pbChainingValue,
//     _In_reads_bytes_( cbData )                  PCBYTE                      pbSrc,
//     _Out_writes_bytes_( cbData )                PBYTE                       pbDst,
//                                                 SIZE_T                      cbData )
//
// Processes the buffer from the LAST block to the first so that the previous
// ciphertext block (the CBC "IV" of each block) is still available in memory.

// NESTED_ENTRY SymCryptAesCbcDecryptAsm
FUNCTION_START(SymCryptAesCbcDecryptAsm, 4, 12)

    //
    // Input parameters:
    //  r0 = pExpandedKey
    //  r1 = pbChainingValue
    //  r2 = pbSrc
    //  r3 = pbDst
    //  [sp] = cbData
    //

    // push {r0-r2, r4-r11, lr}         // (r4-r11/lr are saved by the FUNCTION_START prologue)
    push {r0-r2}
    sub sp, sp, #32

    //
    // Stack layout:
    // [sp]    = pbSrc (current block, moving backwards)
    // [sp+4]  = pbDst (current block, moving backwards)
    // [sp+8]  = first round key pointer (lastEncRoundKey) -- NOTE: earlier comment said pbSrcEnd; the code stores r8 = lastEncRoundKey here
    // [sp+16] = saved last ciphertext block (becomes the new chaining value)
    // [sp+32] = r0 = pbExpandedKey
    // [sp+36] = r1 = pbChainingValue
    // [sp+40] = r2 = pbSrc
    // [sp+80] = cbData
    //

    SYMCRYPT_CHECK_MAGIC r4, r5, r0, SYMCRYPT_AES_EXPANDED_KEY_magic, SymCryptAesCbcDecryptAsm_check_magic_label

    ldr     r4, [sp, #80]       // r4 = cbData
    bics    r4, r4, #15         // r4 &= ~15: whole blocks only
    beq     SymCryptAesCbcDecryptNoData     // skip if no data

    ldr     r9, [r0, #SYMCRYPT_AES_EXPANDED_KEY_lastDecRoundKey]    // r9 = last dec round key (invariant)
    ldr     r8, [r0, #SYMCRYPT_AES_EXPANDED_KEY_lastEncRoundKey]    // r8 = first round key for decryption

    subs    r4, r4, #16         // r4 = offset of the last block
    adds    r3, r3, r4          // r3 = pbDst + cbData - 16 (last dst block)
    adds    r2, r2, r4          // r2 = pbSrc + cbData - 16 (last src block)
    pld     [r2]                // prefetch source data
    str     r3, [sp, #4]
    str     r2, [sp, #0]
    str     r8, [sp, #8]

    // mov32 r10, SymCryptAesInvSbox
    // mov32 r12, SymCryptAesInvSboxMatrixMult
    ldr     r10, =SymCryptAesInvSbox
    ldr     r12, =SymCryptAesInvSboxMatrixMult

    //
    // Load last ciphertext block & save on stack (we need to put it in the pbChaining buffer later)
    //
    pld     [r2, #-32]          // prefetch source data
    ldr     r0, [r2, #0]
    ldr     r1, [r2, #4]
    ldr     r3, [r2, #12]
    ldr     r2, [r2, #8]        // r2 (source pointer) is overwritten last

    strd    r0, r1, [sp, #16]
    strd    r2, r3, [sp, #24]

    b       SymCryptAesCbcDecryptAsmLoopEntry

SymCryptAesCbcDecryptAsmLoop:
    // Loop register setup:
    //  r4-r7 = raw AES decryption of the current ciphertext block
    //  r8    = pbSrc (points just past the current block)
    //  lr    = pbDst (current block)
    // (stale x86-era register comments from the ported source removed)

    ldr     r0, [r8, #-16]      // previous ciphertext block = chaining input for this block
    ldr     r1, [r8, #-12]
    ldr     r2, [r8, #-8]
    ldr     r3, [r8, #-4]
    pld     [r8, #-64]          // prefetch source data

    eors    r4, r4, r0          // plaintext = AES-decrypt(ciphertext) XOR previous ciphertext
    eors    r5, r5, r1
    eors    r6, r6, r2
    eors    r7, r7, r3

    str     r4, [lr, #0]        // store plaintext block
    str     r5, [lr, #4]
    str     r6, [lr, #8]
    str     r7, [lr, #12]

    sub     lr, lr, #16         // pbDst -= 16
    sub     r8, r8, #16         // pbSrc -= 16
    str     r8, [sp]
    str     lr, [sp, #4]

SymCryptAesCbcDecryptAsmLoopEntry:

    ldr     r8, [sp, #8]        // r8 = first round key

    AES_DECRYPT SymCryptAesCbcDecryptAsm_loopLabel

    // ldrd r8, lr, [sp, #0]
    ldr     r8, [sp, #0]        // r8 = pbSrc (current)
    ldr     lr, [sp, #4]        // lr = pbDst (current)
    ldr     r0, [sp, #40]       // r0 = original pbSrc (start of buffer)
    cmp     r8, r0
    bhi     SymCryptAesCbcDecryptAsmLoop    // loop while blocks remain before the current one

    ldr     r8, [sp, #36]       // r8 = pbChainingValue
    ldr     r0, [r8, #0]        // load the caller's chaining value (IV for the first block)
    ldr     r1, [r8, #4]
    ldr     r2, [r8, #8]
    ldr     r3, [r8, #12]

    eors    r4, r4, r0          // first plaintext block = decrypt(first ciphertext) XOR IV
    eors    r5, r5, r1
    eors    r6, r6, r2
    eors    r7, r7, r3

    str     r4, [lr, #0]
    str     r5, [lr, #4]
    str     r6, [lr, #8]
    str     r7, [lr, #12]

    //
    // Update the chaining value to the last ciphertext block
    //
    ldrd    r0, r1, [sp, #16]
    ldrd    r2, r3, [sp, #24]
    str     r0, [r8, #0]
    str     r1, [r8, #4]
    str     r2, [r8, #8]
    str     r3, [r8, #12]

SymCryptAesCbcDecryptNoData:

    add     sp, sp, #32
    pop     {r0-r2}
    // pop {r0-r2, r4-r11, pc}  // return (epilogue emitted by FUNCTION_END)

// NESTED_END SymCryptAesCbcDecryptAsm
FUNCTION_END(SymCryptAesCbcDecryptAsm)
|
||||
|
||||
//
// VOID
// SYMCRYPT_CALL
// SymCryptAesCtrMsb64(
//     _In_                                        PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
//     _In_reads_bytes_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE                       pbChainingValue,
//     _In_reads_bytes_( cbData )                  PCBYTE                      pbSrc,
//     _Out_writes_bytes_( cbData )                PBYTE                       pbDst,
//                                                 SIZE_T                      cbData )
//
// CTR mode with a 64-bit big-endian counter in the second half of the chaining value.

// NESTED_ENTRY SymCryptAesCtrMsb64Asm
// FIX: removed the stray trailing ':' after the FUNCTION_START() invocation.
// Every other FUNCTION_START in this file has no colon; the extra token is
// emitted after the macro expansion and breaks assembly.
FUNCTION_START(SymCryptAesCtrMsb64Asm, 4, 12)

    //
    // Input parameters:
    //  r0 = pExpandedKey
    //  r1 = pbChainingValue
    //  r2 = pbSrc
    //  r3 = pbDst
    //  [sp] = cbData
    //

    // push {r0-r2, r4-r11, lr}         // (r4-r11/lr are saved by the FUNCTION_START prologue)
    push {r0-r2}
    sub sp, sp, #32

    //
    // Stack layout:
    // [sp]    = pbDst
    // [sp+4]  = pbSrcEnd
    // [sp+16] = local copy of chaining data (counter block)
    // [sp+32] = r0 = pbExpandedKey
    // [sp+36] = r1 = pbChainingValue
    // [sp+40] = r2 = pbSrc
    // [sp+80] = cbData
    //

    SYMCRYPT_CHECK_MAGIC r4, r5, r0, SYMCRYPT_AES_EXPANDED_KEY_magic, SymCryptAesCtrMsb64Asm_check_magic_label

    pld     [r2]                // prefetch source data
    ldr     r4, [sp, #80]       // r4 = cbData
    ldr     r9, [r0, #SYMCRYPT_AES_EXPANDED_KEY_lastEncRoundKey]    // r9 = last enc round key (invariant)
    mov     r10, r2             // r10 = pbSrc
    // mov32 r12, SymCryptAesSboxMatrixMult
    ldr     r12, =SymCryptAesSboxMatrixMult     // r12 = pointer to lookup table (invariant)
    bics    r4, r4, #15         // r4 &= ~15: whole blocks only
    beq     SymCryptAesCtrMsb64NoData           // skip if no data
    adds    r4, r4, r2          // r4 = pbSrc + cbData
    // strd r3, r4, [sp]        // save pbDst/pbSrcEnd at [sp]
    str     r3, [sp]            // save pbDst at [sp]
    str     r4, [sp, #4]        // save pbSrcEnd at [sp, #4]

    pld     [r10, #32]          // prefetch source data
    mov     r3, r1              // load chaining state (counter block) from pbChainingValue
    ldr     r0, [r3, #0]        //
    ldr     r1, [r3, #4]        //
    ldr     r2, [r3, #8]        //
    ldr     r3, [r3, #12]       //

    strd    r0, r1, [sp, #16]   // save a local copy
    strd    r2, r3, [sp, #24]   //

SymCryptAesCtrMsb64AsmLoop:
    //
    // Loop register setup
    //  r0,r1,r2,r3 = counter block (chaining state)
    //  r9  = last round key to use
    //  r10 = pbSrc
    //  r12 = SboxMatrixMult
    // (r8 is loaded with the first round key below, and later with pbDst)
    //

    ldr     r8, [sp, #32]       // r8 = first round key
    // FIX: use the function-qualified loop label (was plain 'loopLabel',
    // which would collide with any other AES_ENCRYPT expansion using the
    // same unqualified name in this translation unit).
    AES_ENCRYPT SymCryptAesCtrMsb64Asm_loopLabel
    //
    // Counter block was in r0, r1, r2, r3
    // r8 points to first round key to use
    // r9 is last key to use (unchanged)
    // r12 points to SboxMatrixMult (unchanged)
    // Keystream block ends up in r4, r5, r6, r7
    //

    ldr     r0, [r10, #0]       // load plaintext
    ldr     r1, [r10, #4]       //
    ldr     r2, [r10, #8]       //
    ldr     r3, [r10, #12]      //
    pld     [r10, #64]          // prefetch source data

    // ldrd r8, lr, [sp]        // fetch pbDst/pbSrcEnd
    ldr     r8, [sp]
    ldr     lr, [sp, #4]

    eors    r0, r0, r4          // exclusive-OR against encrypt results (keystream)
    eors    r1, r1, r5          //
    eors    r2, r2, r6          //
    eors    r3, r3, r7          //

    str     r0, [r8, #0]        // store to destination
    str     r1, [r8, #4]        //
    str     r2, [r8, #8]        //
    str     r3, [r8, #12]       //

    ldrd    r0, r1, [sp, #16]   // reload counter block (chaining state)
    ldrd    r2, r3, [sp, #24]   //

    add     r8, r8, #16         // pbDst += 16
    add     r10, r10, #16       // pbSrc += 16
    str     r8, [sp]            // save pbDst

    rev     r3, r3              // byte-reverse the second qword (big-endian counter)
    rev     r2, r2              //
    adds    r3, r3, #1          // increment the 64-bit counter
    adcs    r2, r2, #0          //
    rev     r3, r3              // re-reverse back to big-endian
    rev     r2, r2              //
    strd    r2, r3, [sp, #24]   // write updated counter

    cmp     r10, lr             // done?
    blo     SymCryptAesCtrMsb64AsmLoop          // loop until finished

    ldr     r0, [sp, #36]       // get pbChainingValue
    movs    r1, #0              // get 0 in r1
    str     r2, [r0, #8]        // write back only the modified (counter) half of the chaining state
    str     r3, [r0, #12]       //

    // strd r1, r1, [sp, #16]   // wipe the stack copy of the counter block
    // strd r1, r1, [sp, #24]   //
    str     r1, [sp, #16]
    str     r1, [sp, #20]
    str     r1, [sp, #24]
    str     r1, [sp, #28]

SymCryptAesCtrMsb64NoData:

    add     sp, sp, #32
    pop     {r0-r2}
    // pop {r0-r2, r4-r11, pc}  // return (epilogue emitted by FUNCTION_END)

// NESTED_END SymCryptAesCtrMsb64Asm
FUNCTION_END(SymCryptAesCtrMsb64Asm)
|
|
@ -0,0 +1,790 @@
|
|||
//
|
||||
// fdef_asm.cppasm Assembler code for large integer arithmetic in the default data format for the arm architecture
|
||||
//
|
||||
// Copyright (c) Microsoft Corporation. Licensed under the MIT license.
|
||||
//
|
||||
|
||||
// #include "ksarm.h"
|
||||
|
||||
#include "C_asm_shared.inc"
|
||||
|
||||
// A digit consists of 4 words of 32 bits each
|
||||
|
||||
//UINT32
//SYMCRYPT_CALL
// SymCryptFdefRawAdd(
//     _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE )   PCUINT32    pSrc1,
//     _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE )   PCUINT32    pSrc2,
//     _Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32     pDst,
//                                                             UINT32      nDigits )
//
// Multi-precision add; returns the final carry (0 or 1) in r0.
// A digit is 4 words of 32 bits; each loop iteration processes one digit.
//
// Initial inputs to registers:
//  pSrc1   -> r0
//  pSrc2   -> r1
//  pDst    -> r2
//  nDigits -> r3

// LEAF_ENTRY SymCryptFdefRawAddAsm
FUNCTION_START(SymCryptFdefRawAddAsm, 4, 10)
    // push {r4-r9, lr}                 // (saved by the FUNCTION_START prologue)

    neg     r3, r3              // negate the digit count (count up towards zero)
    mov     r8, #0              // carry = r8 = 0
    mov     r9, #0              // r9 = 0 (constant zero used to materialize the carry)

SymCryptFdefRawAddAsmLoop:
    rrxs    r8, r8              // set the carry flag if bit[0] of r8 is set

    ldmia   r0!, {r4, r6}       // Load two words of pSrc1
    ldmia   r1!, {r5, r7}       // Load two words of pSrc2
    adcs    r4, r4, r5
    adcs    r6, r6, r7
    stmia   r2!, {r4, r6}       // Store the result in the destination

    ldmia   r0!, {r4, r6}       // Load two words of pSrc1
    ldmia   r1!, {r5, r7}       // Load two words of pSrc2
    adcs    r4, r4, r5
    adcs    r6, r6, r7
    stmia   r2!, {r4, r6}       // Store the result in the destination

    adc     r8, r9, r9          // r8 = 1 if the carry flag is set

    adds    r3, r3, #1          // Increment the (negated) digit count by one
    bne     SymCryptFdefRawAddAsmLoop

    mov     r0, r8              // Set the return value equal to the carry

    // pop {r4-r9, pc}          // (epilogue emitted by FUNCTION_END)

// LEAF_END SymCryptFdefRawAddAsm
FUNCTION_END(SymCryptFdefRawAddAsm)
|
||||
|
||||
//UINT32
//SYMCRYPT_CALL
//SymCryptFdefRawSub(
//     _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE )   PCUINT32    Src1,
//     _In_reads_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE )   PCUINT32    Src2,
//     _Out_writes_bytes_(nDigits * SYMCRYPT_FDEF_DIGIT_SIZE ) PUINT32     Dst,
//                                                             UINT32      nDigits )
//
// Multi-precision subtract (Src1 - Src2); returns the final borrow (0 or 1) in r0.
//
// Initial inputs to registers:
//  pSrc1   -> r0
//  pSrc2   -> r1
//  pDst    -> r2
//  nDigits -> r3

// LEAF_ENTRY SymCryptFdefRawSubAsm
FUNCTION_START(SymCryptFdefRawSubAsm, 4, 10)
    // push {r4-r9, lr}                 // (saved by the FUNCTION_START prologue)

    neg     r3, r3              // negate the digit count (count up towards zero)
    mov     r8, #0              // borrow = r8 = 0
    mov     r9, #0              // r9 = 0 (constant zero)

SymCryptFdefRawSubAsmLoop:
    subs    r8, r9, r8          // if r8>0 then the "borrow flag" (carry clear) is set

    ldmia   r0!, {r4, r6}       // Load two words of pSrc1
    ldmia   r1!, {r5, r7}       // Load two words of pSrc2
    sbcs    r4, r4, r5
    sbcs    r6, r6, r7
    stmia   r2!, {r4, r6}       // Store the result in the destination

    ldmia   r0!, {r4, r6}       // Load two words of pSrc1
    ldmia   r1!, {r5, r7}       // Load two words of pSrc2
    sbcs    r4, r4, r5
    sbcs    r6, r6, r7
    stmia   r2!, {r4, r6}       // Store the result in the destination

    sbc     r8, r9, r9          // If borrow=1, then r8 = -1 = 0xffffffff

    adds    r3, r3, #1          // Increment the (negated) digit count by one
    bne     SymCryptFdefRawSubAsmLoop

    and     r0, r8, #1          // If r8>0, set the return value to 1

    // pop {r4-r9, pc}          // (epilogue emitted by FUNCTION_END)

// LEAF_END SymCryptFdefRawSubAsm
FUNCTION_END(SymCryptFdefRawSubAsm)
|
||||
|
||||
//VOID
//SYMCRYPT_CALL
//SymCryptFdefMaskedCopy(
//     _In_reads_bytes_( nDigits * SYMCRYPT_FDEF_DIGIT_SIZE )      PCBYTE  pbSrc,
//     _InOut_writes_bytes_( nDigits * SYMCRYPT_FDEF_DIGIT_SIZE )  PBYTE   pbDst,
//                                                                 UINT32  nDigits,
//                                                                 UINT32  mask )
//
// Constant-time conditional copy: pbDst = mask ? pbSrc : pbDst,
// implemented branch-free as dst = (src & mask) | (dst & ~mask).
//
// Initial inputs to registers:
//  pbSrc   -> r0
//  pbDst   -> r1
//  nDigits -> r2
//  mask    -> r3

// LEAF_ENTRY SymCryptFdefMaskedCopyAsm
FUNCTION_START(SymCryptFdefMaskedCopyAsm, 4, 10)
    // push {r4-r9, lr}                 // (saved by the FUNCTION_START prologue)

    neg     r2, r2              // negate the digit count (count up towards zero)
    mov     r9, #0              // r9 = 0

    subs    r4, r9, r3          // If (r3 > 0) clear the carry flag (i.e. borrow)
    sbc     r3, r9, r9          // r3 = mask = 0xffffffff if the carry flag is clear
    // orn r9, r9, r3           // r9 = NOT(mask) -- ORN is Thumb-2 only; emulated below
    orr     r9, r9, r3
    mvn     r9, r9              // r9 = ~mask

    mov     r8, r1              // save the destination pointer (r1 is advanced by the loads)

SymCryptFdefMaskedCopyAsmLoop:
    ldmia   r0!, {r4, r6}       // Load two words of the source
    ldmia   r1!, {r5, r7}       // Load two words of the destination
    and     r4, r4, r3          // src & mask
    and     r5, r5, r9          // dst & ~mask
    orr     r4, r4, r5
    and     r6, r6, r3
    and     r7, r7, r9
    orr     r6, r6, r7
    stmia   r8!, {r4, r6}       // Store the two words in the destination

    ldmia   r0!, {r4, r6}       // Load two words of the source
    ldmia   r1!, {r5, r7}       // Load two words of the destination
    and     r4, r4, r3
    and     r5, r5, r9
    orr     r4, r4, r5
    and     r6, r6, r3
    and     r7, r7, r9
    orr     r6, r6, r7
    stmia   r8!, {r4, r6}       // Store the two words in the destination

    adds    r2, r2, #1          // Increment the (negated) digit count by one
    bne     SymCryptFdefMaskedCopyAsmLoop

    // Done, no return value

    // pop {r4-r9, pc}          // (epilogue emitted by FUNCTION_END)

// LEAF_END SymCryptFdefMaskedCopyAsm
FUNCTION_END(SymCryptFdefMaskedCopyAsm)
|
||||
|
||||
//VOID
//SYMCRYPT_CALL
//SymCryptFdefRawMul(
//     _In_reads_(nWords1)             PCUINT32    pSrc1,
//                                     UINT32      nDigits1,
//     _In_reads_(nWords2)             PCUINT32    pSrc2,
//                                     UINT32      nDigits2,
//     _Out_writes_(nWords1 + nWords2) PUINT32     pDst )
//
// Initial inputs to registers:
//  pSrc1    -> r0
//  nDigits1 -> r1
//  pSrc2    -> r2
//  nDigits2 -> r3
//  pDst     -> In the stack
//
// Basic structure:
//  for each 2 words in Src1:
//      Dst += Src2 * (2 words of Src1)
//
// Register assignments
//  r0 = pSrc1 (moving forward one word every outer loop)
//  r1 = negated word count of pSrc1
//  r2 = pSrc2 (moving forward one *digit* every inner loop)
//  r3 = negated digit count of pSrc2 and pDst
//  r4 = pDst (moving forward one *digit* every inner loop)
//  r5 = Stored pDst (moving forward one word every outer loop)
//  r6, r7 = Current words loaded from pSrc1
//  r8, r9 = Current words loaded from pSrc2
//  <r12:r11:r10> = "96-bit" sliding register to hold the result of multiplies
//
// Stack assignments
#define pSrc2       0   // Stored pSrc2 in stack
#define nDigits2    4   // Stored negated digit count of pSrc2 in stack


// LEAF_ENTRY SymCryptFdefRawMulAsm
FUNCTION_START(SymCryptFdefRawMulAsm, 4, 13)
    // push {r4-r12, lr}                // (saved by the FUNCTION_START prologue)
    sub     sp, sp, #8          // 2 stack slots: pSrc2, nDigits2

    lsl     r1, r1, #2          // Calculate word count (4 words per digit)

    ldr     r4, [sp, #(8+4*10)] // load pDst (5th argument, past locals + 10 saved registers)

    neg     r1, r1              // negate nWords1
    neg     r3, r3              // negate nDigits2

    mov     r5, r4              // store pDst
    str     r2, [sp, #pSrc2]    // store pSrc2
    str     r3, [sp, #nDigits2] // store -nDigits2 for later

    //
    // First iteration of main loop (no adding of previous values from pDst)
    //
    mov     r11, #0             // Setting r11 = 0
    mov     r12, #0             // and r12 = 0
    ldmia   r0!, {r6, r7}       // Load two words from pSrc1

SymCryptFdefRawMulAsmLoopInner1:

    adds    r3, r3, #1          // move one digit up (sets flags; umaal/ldmia below leave them intact for the bne)

    ldmia   r2!, {r8, r9}       // Load two words from pSrc2

    mov     r10, #0             // Setting r10 = 0
    umaal   r10, r11, r6, r8    // <r11:r10> = r6 * r8 + r10 + r11
    str     r10, [r4], #4       // Store to destination
    umaal   r11, r12, r7, r8    // <r12:r11> = r7 * r8 + r11 + r12

    mov     r10, #0             // Setting r10 = 0
    umaal   r10, r11, r6, r9    // <r11:r10> = r6 * r9 + r10 + r11
    str     r10, [r4], #4       // Store to destination
    umaal   r11, r12, r7, r9    // <r12:r11> = r7 * r9 + r11 + r12

    ldmia   r2!, {r8, r9}       // Load two words from pSrc2

    mov     r10, #0             // Setting r10 = 0
    umaal   r10, r11, r6, r8    // <r11:r10> = r6 * r8 + r10 + r11
    str     r10, [r4], #4       // Store to destination
    umaal   r11, r12, r7, r8    // <r12:r11> = r7 * r8 + r11 + r12

    mov     r10, #0             // Setting r10 = 0
    umaal   r10, r11, r6, r9    // <r11:r10> = r6 * r9 + r10 + r11
    str     r10, [r4], #4       // Store to destination
    umaal   r11, r12, r7, r9    // <r12:r11> = r7 * r9 + r11 + r12

    bne     SymCryptFdefRawMulAsmLoopInner1     // branches on the 'adds r3' flags above

    stmia   r4, {r11, r12}      // Store the top two words in the destination

    add     r1, r1, #2          // move two words up
    add     r5, r5, #8          // move start of pDst two words up

    //
    // MAIN LOOP
    //
SymCryptFdefRawMulAsmLoopOuter:
    ldr     r3, [sp, #nDigits2] // set -nDigits2
    ldr     r2, [sp, #pSrc2]    // set pSrc2
    mov     r4, r5              // set pDst

    mov     r11, #0             // Setting r11 = 0
    mov     r12, #0             // and r12 = 0
    ldmia   r0!, {r6, r7}       // Load two words from pSrc1

SymCryptFdefRawMulAsmLoopInner:

    adds    r3, r3, #1          // move one digit up (sets flags for the bne below)

    ldmia   r2!, {r8, r9}       // Load two words from pSrc2

    ldr     r10, [r4]           // load 1 word from pDst (accumulate into existing partial product)
    umaal   r10, r11, r6, r8    // <r11:r10> = r6 * r8 + r10 + r11
    str     r10, [r4], #4       // Store to destination
    umaal   r11, r12, r7, r8    // <r12:r11> = r7 * r8 + r11 + r12

    ldr     r10, [r4]           // load 1 word from pDst
    umaal   r10, r11, r6, r9    // <r11:r10> = r6 * r9 + r10 + r11
    str     r10, [r4], #4       // Store to destination
    umaal   r11, r12, r7, r9    // <r12:r11> = r7 * r9 + r11 + r12

    ldmia   r2!, {r8, r9}       // Load two words from pSrc2

    ldr     r10, [r4]           // load 1 word from pDst
    umaal   r10, r11, r6, r8    // <r11:r10> = r6 * r8 + r10 + r11
    str     r10, [r4], #4       // Store to destination
    umaal   r11, r12, r7, r8    // <r12:r11> = r7 * r8 + r11 + r12

    ldr     r10, [r4]           // load 1 word from pDst
    umaal   r10, r11, r6, r9    // <r11:r10> = r6 * r9 + r10 + r11
    str     r10, [r4], #4       // Store to destination
    umaal   r11, r12, r7, r9    // <r12:r11> = r7 * r9 + r11 + r12

    bne     SymCryptFdefRawMulAsmLoopInner

    adds    r1, r1, #2          // move two words up (sets flags for the outer bne)
    add     r5, r5, #8          // move start of pDst two words up

    stmia   r4, {r11, r12}      // Store the top two words in the destination

    bne     SymCryptFdefRawMulAsmLoopOuter      // branches on the 'adds r1' flags above

    // Done, no return value

    add     sp, sp, #8
    // pop {r4-r12, pc}         // (epilogue emitted by FUNCTION_END)

// LEAF_END SymCryptFdefRawMulAsm
FUNCTION_END(SymCryptFdefRawMulAsm)
|
||||
|
||||
// Macro for the first loop of the first pass of RawSquareAsm.
// It takes one word from the source, multiplies it with the mulword,
// adds the high level word of the previous macro call, and stores it into
// the destination.
//
// No word is taken from the destination; thus r10 is always set to 0.
//
// No carry flag is propagated from the previous macro call as the maximum is
//      (2^32-1)^2 + 2^32-1 = 2^64 - 2^32
//
// Inputs (by convention of the surrounding code):
//  r2  = current pSrc window, r4 = current pDst window,
//  r6  = mulword, r11 = high word carried from the previous call.
MACRO_START(SQR_SINGLEADD_32, index)

    mov     r10, #0
    ldr     r8, [r2, #4*index]      // pSrc[i+j]

    umaal   r10, r11, r6, r8        // <r11:r10> = r6 * r8 + r10 + r11

    str     r10, [r4, #4*index]     // Store to destination
MACRO_END()
|
||||
|
||||
// Macro for the remaining loops of the first pass of RawSquareAsm.
// The only difference to SQR_SINGLEADD_32 is that it also adds the word loaded
// from the destination buffer (accumulating into the existing partial result).
//
// No carry flag is propagated from the previous macro call as the maximum is
//      (2^32-1)^2 + 2(2^32-1) = 2^64 - 1
MACRO_START(SQR_DOUBLEADD_32, index)

    ldr     r8, [r2, #4*index]      // pSrc[i+j]
    ldr     r10, [r4, #4*index]     // pDst[2*(i+j)]

    umaal   r10, r11, r6, r8        // <r11:r10> = r6 * r8 + r10 + r11

    str     r10, [r4, #4*index]     // Store to destination

MACRO_END()
|
||||
|
||||
// Macro for the third pass loop of RawSquareAsm.
// It takes one mulword from the source, squares it, and
// adds it to the even columns of the destination. The carries are propagated
// to the odd columns.
//
// Here we can have a (1-bit) carry to the next call because the maximum value for
// a pair of columns is (2^32-1)^2+(2^64-1)+1 = 2^65 - 2^33 + 1 < 2^65 - 1
//
// Note: the incoming carry flag is consumed by the first adcs, and the flag
// state left by the second adcs is the carry into the next macro invocation.
MACRO_START(SQR_DIAGONAL_PROP, index)
    ldr     r6, [r0, #4*index]      // mulword

    umull   r10, r11, r6, r6        // <r11:r10> = r6^2

    ldr     r8, [r4, #8*index]      // Load even column
    ldr     r9, [r4, #8*index + 4]  // Load odd column

    // Adding the square to the even column
    adcs    r10, r10, r8            // consumes the carry from the previous call and updates the flags

    // Propagating the sum to the next column
    adcs    r11, r11, r9            // This can generate a carry (for the next call)

    str     r10, [r4, #8*index]     // Store even column
    str     r11, [r4, #8*index + 4] // Store odd column
MACRO_END()
|
||||
|
||||
// VOID
// SYMCRYPT_CALL
// SymCryptFdefRawSquareAsm(
//     _In_reads_(nDgigits*SYMCRYPT_FDEF_DIGIT_NUINT32)    PCUINT32    pSrc,
//                                                         UINT32      nDigits,
//     _Out_writes_(2*nWords)                              PUINT32     pDst )
//
// Three-pass squaring:
//  pass 1: accumulate all cross products pSrc[i]*pSrc[j] for i<j into pDst
//  pass 2: double the accumulated result (shift left by 1 bit)
//  pass 3: add the diagonal squares pSrc[i]^2 and propagate carries
//
// Initial inputs to registers:
//  pSrc    -> r0
//  nDigits -> r1
//  pDst    -> r2
//
// Register assignments
//  r0 = pSrc
//  r1 = negated word count of pSrc
//  r2 = pSrc (moving forward one digit / 4 words every inner loop)
//  r3 = negated digit count of pSrc
//  r4 = pDst (moving forward one digit every inner loop)
//  r5 = pDst (moving forward one word every outer loop)
//  r6 = mulword from pSrc
//  r7 = Stored negated digit count of pSrc
//  r8 = Current words loaded from pSrc
//  r9 = Cyclic counter for the jumps
//  r10, r11 = "64-bit" sliding register to hold the result of multiplies,
//             r10 also receives a word from pDst
//  r12 = Negated digit counter of pSrc (updated every 4 iterations of main loop)
//
// Stack assignments
#define pDstSq  0   // Stored pDst in stack
#define pSrc    4   // Stored pSrc in stack


// LEAF_ENTRY SymCryptFdefRawSquareAsm
FUNCTION_START(SymCryptFdefRawSquareAsm, 3, 13)
    // push {r4-r12, lr}                // (saved by the FUNCTION_START prologue)
    sub     sp, sp, #8          // 2 stack slots: pDstSq, pSrc


    mov     r3, r1              // digit count into r3

    lsl     r1, r1, #2          // Calculate word count (4 words per digit)

    neg     r1, r1              // negate nWords
    neg     r3, r3              // negate nDigitsSq

    mov     r4, r2              // pDst
    mov     r5, r2              // store pDst

    str     r0, [sp, #pSrc]     // store pSrc
    str     r5, [sp, #pDstSq]   // store pDst
    mov     r7, r3              // store -nDigits for later
    mov     r12, r3             // Negated digit counter of pSrc

    mov     r2, r0              // inner loop pSrc

    //
    // First iteration of main loop (no adding of previous values from pDst)
    //
    ands    r11, r11, #0        // Clearing the carry flag and setting r11 = 0
    ldr     r6, [r0]            // load the first word (mulword) from pSrc1
    str     r11, [r4]           // store 0 for the first word (no cross product lands there)

    b       SymCryptFdefRawSquareAsmInnerLoopInit_Word1     // word 0 of the first digit is the mulword itself: skip it

SymCryptFdefRawSquareAsmInnerLoopInit_Word0:
    SQR_SINGLEADD_32 0

SymCryptFdefRawSquareAsmInnerLoopInit_Word1:
    SQR_SINGLEADD_32 1

    SQR_SINGLEADD_32 2

    SQR_SINGLEADD_32 3


    add     r2, r2, #16         // next digit of pSrc
    add     r4, r4, #16         // next digit of pDst

    adds    r3, r3, #1          // move one digit up
    bne     SymCryptFdefRawSquareAsmInnerLoopInit_Word0

    str     r11, [r4]           // Store the next (high) word into the destination
    add     r1, r1, #2          // move two words up (so we stop when real word count is "-1")
    mov     r9, #1              // Cyclic counter (which word within the current digit is the mulword)

    //
    // MAIN LOOP
    //
SymCryptFdefRawSquareAsmOuterLoop:

    add     r5, r5, #4          // move start of pDst one word up

    mov     r3, r12             // set -nDigits
    mov     r2, r0              // set pSrc
    mov     r4, r5              // set pDst

    ands    r11, r11, #0        // Clearing the carry flag and setting r11 = 0
    ldr     r6, [r0, r9, LSL #2]        // load the next mulword from pSrc

    // Cyclic counter and jump logic: enter the unrolled loop just past the
    // mulword's own position so only products with later words are formed.
    add     r9, r9, #1
    cmp     r9, #1
    beq     SymCryptFdefRawSquareAsmInnerLoop_Word1
    cmp     r9, #2
    beq     SymCryptFdefRawSquareAsmInnerLoop_Word2
    cmp     r9, #3
    beq     SymCryptFdefRawSquareAsmInnerLoop_Word3

    // The following instructions are only executed when r9 == 4
    mov     r9, #0              // Set it to 0

    add     r0, r0, #16         // move start of pSrc 4 words up
    add     r5, r5, #16         // move pDst 4 words up

    mov     r2, r0              // set pSrc
    mov     r4, r5              // set pDst

    adds    r3, r3, #1          // add 1 digit (one fewer digit of cross products remains)
    mov     r12, r3             // set the new digit counter

SymCryptFdefRawSquareAsmInnerLoop_Word0:
    SQR_DOUBLEADD_32 0

SymCryptFdefRawSquareAsmInnerLoop_Word1:
    SQR_DOUBLEADD_32 1

SymCryptFdefRawSquareAsmInnerLoop_Word2:
    SQR_DOUBLEADD_32 2

SymCryptFdefRawSquareAsmInnerLoop_Word3:
    SQR_DOUBLEADD_32 3


    add     r2, r2, #16         // next digit of pSrc
    add     r4, r4, #16         // next digit of pDst
    adds    r3, r3, #1          // move one digit up
    bne     SymCryptFdefRawSquareAsmInnerLoop_Word0

    str     r11, [r4]           // Store the next (high) word into the destination

    adds    r1, r1, #1          // move one word up
    bne     SymCryptFdefRawSquareAsmOuterLoop

    eor     r11, r11, r11       // Setting r11 = 0
    str     r11, [r5, #20]      // Store 0 to destination for the top word

    // //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    // Second Pass - Shifting all results 1 bit left
    // Third Pass - Adding the squares on the even columns and propagating the sum
    // //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

    mov     r3, r7              // -nDigits
    lsl     r3, r3, #1          // Double digits (destination has 2*nDigits digits)
    ldr     r4, [sp, #pDstSq]   // pDst pointer
    ands    r1, r1, #0          // Clear the flags (r1 = 0)
    ands    r2, r2, #0          // Clear the flags (r2 = 0, carry word between iterations)

SymCryptFdefRawSquareAsmSecondPass:
    rrxs    r2, r2              // set the carry flag if bit[0] of r2 is set

    ldmia   r4, {r8, r9}
    adcs    r8, r8, r8          // Shift left and add the carry
    adcs    r9, r9, r9
    stmia   r4!, {r8, r9}

    ldmia   r4, {r10, r11}
    adcs    r10, r10, r10       // Shift left and add the carry
    adcs    r11, r11, r11
    stmia   r4!, {r10, r11}

    adc     r2, r1, r1          // materialize the carry into r2 for the next iteration

    adds    r3, r3, #1          // move one digit up
    bne     SymCryptFdefRawSquareAsmSecondPass



    ldr     r0, [sp, #pSrc]     // src pointer
    ldr     r4, [sp, #pDstSq]   // pDst pointer
    // mov r3, r7               // Use r7 directly as the digit counter instead
    // ands r1, r1, #0          // (r1 is already 0 from the second pass)
    ands    r2, r2, #0          // Clear the flags (r2 = carry word between iterations)

SymCryptFdefRawSquareAsmThirdPass:
    rrxs    r2, r2              // set the carry flag if bit[0] of r2 is set

    SQR_DIAGONAL_PROP 0
    SQR_DIAGONAL_PROP 1
    SQR_DIAGONAL_PROP 2
    SQR_DIAGONAL_PROP 3

    adc     r2, r1, r1          // materialize the carry into r2 for the next iteration

    add     r0, r0, #16         // One digit up (not updated in SQR_DIAGONAL_PROP)
    add     r4, r4, #32         // Two digits up (not updated in SQR_DIAGONAL_PROP)

    adds    r7, r7, #1          // move one digit up
    bne     SymCryptFdefRawSquareAsmThirdPass

    // Done, no return value

    add     sp, sp, #8
    // pop {r4-r12, pc}         // (epilogue emitted by FUNCTION_END)

// LEAF_END SymCryptFdefRawSquareAsm
FUNCTION_END(SymCryptFdefRawSquareAsm)
|
||||
|
||||
//VOID
|
||||
//SymCryptFdefMontgomeryReduceAsm(
|
||||
// _In_ PCSYMCRYPT_MODULUS pmMod,
|
||||
// _Inout_ PUINT32 pSrc,
|
||||
// _Out_ PUINT32 pDst )
|
||||
//
|
||||
// Initial inputs to registers:
|
||||
// pmMod -> r0
|
||||
// pSrc -> r1
|
||||
// pDst -> r2
|
||||
//
|
||||
// Register assignments
|
||||
// r0 = pMod (moving forward one *digit* every inner loop)
|
||||
// r1 = pSrc (moving forward one *digit* every inner loop)
|
||||
// r2 = Stored pSrc (moving forward one word every outer loop)
|
||||
// r3 = negated digit count of pSrc and pMod
|
||||
// r4 = negated word count of pSrc
|
||||
// r5, r6 = m = pSrc[i]*Inv64
|
||||
// r7 = hc = high carry variable
|
||||
// r8, r9 = Current words loaded from pMod
|
||||
// <r12:r11:r10> = "96-bit" sliding register to hold the result of multiplies
|
||||
|
||||
// Stack assignments
|
||||
#define pMod 0 // Stored pMod
|
||||
#define pDst 4 // Stored pDst
|
||||
#define nDigits 8 // Stored negated digit count of pSrc
|
||||
#define inv64 12 // Inv64 of modulus
|
||||
|
||||
// LEAF_ENTRY SymCryptFdefMontgomeryReduceAsm
|
||||
FUNCTION_START(SymCryptFdefMontgomeryReduceAsm, 3, 13)
|
||||
// push {r4-r12, lr}
|
||||
sub sp, sp, #16
|
||||
|
||||
str r2, [sp, #pDst] // Store pDst in the stack
|
||||
ldr r3, [r0, #SymCryptModulusNdigitsOffsetArm] // # of Digits
|
||||
ldr r5, [r0, #SymCryptModulusInv64OffsetArm] // Inv64 of modulus
|
||||
add r0, r0, #SymCryptModulusValueOffsetArm // pMod
|
||||
str r5, [sp, #inv64] // Store inv64 in the stack
|
||||
|
||||
lsl r4, r3, #2 // Multiply by 4 to get the number of words
|
||||
|
||||
neg r3, r3 // Negate the digit count
|
||||
neg r4, r4 // Negate the word count
|
||||
|
||||
str r0, [sp, #pMod] // Store the pMod pointer
|
||||
mov r2, r1 // Store the pSrc pointer
|
||||
str r3, [sp, #nDigits] // Store the digit count for later
|
||||
|
||||
eor r7, r7, r7 // Set hc to 0
|
||||
|
||||
//
|
||||
// Main loop
|
||||
//
|
||||
SymCryptFdefMontgomeryReduceAsmOuter:
|
||||
ldr r3, [sp, #inv64] // Inv64 of modulus
|
||||
|
||||
ldmia r1, {r10, r12} // Load two words from pSrc
|
||||
ldmia r0, {r8,r9} // Load two words from pMod
|
||||
mov r11, #0
|
||||
mul r5, r10, r3 // <31:0> bits of pSrc[i]*Inv64 = m1 (first multiplier)
|
||||
umaal r10, r11, r5, r8 // r11 <-- High( m1*pMod[0] + pSrc[i] )
|
||||
umaal r12, r11, r5, r9 // Calculate pSrc[i+1] = Low( m1*pMod[1] + pSrc[i+1] + High( m1*pMod[0] + pSrc[i] ))
|
||||
mul r6, r12, r3 // <31:0> bits of pSrc[i+1]*Inv64 = m2
|
||||
|
||||
ldr r3, [sp, #nDigits] // Reset the digit counter
|
||||
mov r11, #0 // Set c to 0
|
||||
mov r12, #0 // Set c to 0
|
||||
|
||||
SymCryptFdefMontgomeryReduceAsmInner:
|
||||
adds r3, r3, #1 // Move one digit up (none of the commands updates the carry)
|
||||
|
||||
ldmia r0!, {r8, r9} // Load two words from pMod[]
|
||||
|
||||
ldr r10, [r1] // pSrc[j]
|
||||
umaal r10, r11, r5, r8 // c = <r11:r10> = m1 * pMod[j] + pSrc[j] + c
|
||||
str r10, [r1], #4 // pSrc[j] = (UINT32) c
|
||||
umaal r11, r12, r6, r8 // c = <r12:r11> = m2 * pMod[j] + c
|
||||
|
||||
ldr r10, [r1] // pSrc[j]
|
||||
umaal r10, r11, r5, r9 // c = <r11:r10> = m1 * pMod[j] + pSrc[j] + c
|
||||
str r10, [r1], #4 // pSrc[j] = (UINT32) c
|
||||
umaal r11, r12, r6, r9 // c = <r12:r11> = m2 * pMod[j] + c
|
||||
|
||||
ldmia r0!, {r8, r9} // Load two words from pMod[]
|
||||
|
||||
ldr r10, [r1] // pSrc[j]
|
||||
umaal r10, r11, r5, r8 // c = <r11:r10> = m1 * pMod[j] + pSrc[j] + c
|
||||
str r10, [r1], #4 // pSrc[j] = (UINT32) c
|
||||
umaal r11, r12, r6, r8 // c = <r12:r11> = m2 * pMod[j] + c
|
||||
|
||||
ldr r10, [r1] // pSrc[j]
|
||||
umaal r10, r11, r5, r9 // c = <r11:r10> = m1 * pMod[j] + pSrc[j] + c
|
||||
str r10, [r1], #4 // pSrc[j] = (UINT32) c
|
||||
umaal r11, r12, r6, r9 // c = <r12:r11> = m2 * pMod[j] + c
|
||||
|
||||
bne SymCryptFdefMontgomeryReduceAsmInner
|
||||
|
||||
mov r8, #0 // r8 = 0
|
||||
mov r9, #0 // r9 = 0
|
||||
|
||||
ldmia r1, {r5, r6} // Load pSrc[nWords] and pSrc[nWords+1]
|
||||
|
||||
adds r11, r11, r5 // c + pSrc[nWords]
|
||||
adc r8, r8, #0 // Add the carry if any
|
||||
adds r11, r11, r7 // c + pSrc[nWords] + hc
|
||||
adc r8, r8, #0 // Add the carry if any
|
||||
str r11, [r1], #4 // pSrc[nWords] = c
|
||||
|
||||
adds r12, r12, r6 // c + pSrc[nWords+1]
|
||||
adc r9, r9, #0 // Add the carry if any
|
||||
adds r12, r12, r8 // c + pSrc[nWords] + hc
|
||||
adc r7, r9, #0 // Add the carry if any
|
||||
str r12, [r1] // pSrc[nWords+1] = c
|
||||
|
||||
adds r4, r4, #2 // Move two words up
|
||||
|
||||
add r2, r2, #8 // Move stored pSrc pointer two words up
|
||||
ldr r0, [sp, #pMod] // Restore the pMod pointer
|
||||
mov r1, r2 // Restore the pSrc pointer
|
||||
|
||||
bne SymCryptFdefMontgomeryReduceAsmOuter
|
||||
|
||||
//
|
||||
// Subtraction
|
||||
//
|
||||
|
||||
// Prepare the pointers for subtract
|
||||
mov r0, r2 // pSrc
|
||||
mov r11, r2 // Store pSrc for later
|
||||
ldr r1, [sp, #pMod] // pMod
|
||||
ldr r2, [sp, #pDst] // pDst
|
||||
ldr r3, [sp, #nDigits] // Reset the digit counter
|
||||
|
||||
mov r10, r7 // r10 = hc
|
||||
|
||||
mov r8, #0 // borrow = r8 = 0
|
||||
mov r9, #0 // r9 = 0
|
||||
|
||||
SymCryptFdefMontgomeryReduceRawSubAsmLoop:
|
||||
subs r8, r9, r8 // if r8>0 then the "borrow flag" is set
|
||||
|
||||
ldmia r0!, {r4, r6} // Load two words of pSrc1
|
||||
ldmia r1!, {r5, r7} // Load two words of pSrc2
|
||||
sbcs r4, r4, r5
|
||||
sbcs r6, r6, r7
|
||||
stmia r2!, {r4, r6} // Store the result in the destination
|
||||
|
||||
ldmia r0!, {r4, r6} // Load two words of pSrc1
|
||||
ldmia r1!, {r5, r7} // Load two words of pSrc2
|
||||
sbcs r4, r4, r5
|
||||
sbcs r6, r6, r7
|
||||
stmia r2!, {r4, r6} // Store the result in the destination
|
||||
|
||||
sbc r8, r9, r9 // If borrow=1, then r8 = -1 = 0xffffffff
|
||||
|
||||
adds r3, r3, #1 // Increment the digit count by one
|
||||
bne SymCryptFdefMontgomeryReduceRawSubAsmLoop
|
||||
|
||||
// Prepare the pointers for masked copy
|
||||
mov r0, r11 // pSrc
|
||||
ldr r1, [sp, #pDst] // pDst
|
||||
|
||||
and r9, r8, #1 // If r8>0, set the return value to 1
|
||||
orr r11, r10, r9 // r11 = hc|d
|
||||
|
||||
ldr r2, [sp, #nDigits] // Restore the digit counter
|
||||
|
||||
mov r9, #0 // r9 = 0
|
||||
|
||||
subs r4, r10, r11 // If (r11 > r10) clear the carry flag (i.e. borrow)
|
||||
sbc r3, r9, r9 // r3 = mask = 0xffffffff if the carry flag is clear
|
||||
// orn r9, r9, r3 // r9 = NOT(MASK) = 0 if r3 = 0xffffffff
|
||||
orr r9, r9, r3
|
||||
mvn r9, r9
|
||||
mov r8, r1 // save the destination pointer
|
||||
|
||||
SymCryptFdefMontgomeryReduceMaskedCopyAsmLoop:
|
||||
ldmia r0!, {r4, r6} // Load two words of the source
|
||||
ldmia r1!, {r5, r7} // Load two words of the destination
|
||||
and r4, r4, r3
|
||||
and r5, r5, r9
|
||||
orr r4, r4, r5
|
||||
and r6, r6, r3
|
||||
and r7, r7, r9
|
||||
orr r6, r6, r7
|
||||
stmia r8!, {r4, r6} // Store the two words in the destination
|
||||
|
||||
ldmia r0!, {r4, r6} // Load two words of the source
|
||||
ldmia r1!, {r5, r7} // Load two words of the destination
|
||||
and r4, r4, r3
|
||||
and r5, r5, r9
|
||||
orr r4, r4, r5
|
||||
and r6, r6, r3
|
||||
and r7, r7, r9
|
||||
orr r6, r6, r7
|
||||
stmia r8!, {r4, r6} // Store the two words in the destination
|
||||
|
||||
adds r2, r2, #1 // Increment the digit count by one
|
||||
bne SymCryptFdefMontgomeryReduceMaskedCopyAsmLoop
|
||||
|
||||
// Done, no return value
|
||||
|
||||
add sp, sp, #16
|
||||
// pop {r4-r12, pc}
|
||||
|
||||
// LEAF_END SymCryptFdefMontgomeryReduceAsm
|
||||
FUNCTION_END(SymCryptFdefMontgomeryReduceAsm)
|
|
@ -0,0 +1,20 @@
|
|||
// SymCryptWipe
|
||||
//
|
||||
// Copyright (c) Microsoft Corporation. Licensed under the MIT license.
|
||||
//
|
||||
// Secure wipe
|
||||
//
|
||||
|
||||
|
||||
// VOID
|
||||
// SYMCRYPT_CALL
|
||||
// SymCryptWipe( _Out_writes_bytes_( cbData ) PVOID pbData,
|
||||
// SIZE_T cbData )
|
||||
|
||||
FUNCTION_START(SymCryptWipeAsm, 2, 0)
|
||||
// we just jump to memset.
|
||||
// this is enough to stop the compiler optimizing the memset away.
|
||||
mov r2, r1
|
||||
mov r1, #0
|
||||
bl memset
|
||||
FUNCTION_END(SymCryptWipeAsm)
|
169
lib/sha256Par.c
169
lib/sha256Par.c
|
@ -825,48 +825,46 @@ SymCryptParallelSha256AppendBlocks_neon(
|
|||
//
|
||||
// This can probably be done faster, but we are missing the VTRN.64 instruction
|
||||
// which makes it hard to do this efficient in intrinsics.
|
||||
// The vsetq_lane_u32 seems to be completely ignored by the compiler.
|
||||
//
|
||||
ha[7].n128_u32[0] = pChain[0]->H[0];
|
||||
ha[7].n128_u32[1] = pChain[1]->H[0];
|
||||
ha[7].n128_u32[2] = pChain[2]->H[0];
|
||||
ha[7].n128_u32[3] = pChain[3]->H[0];
|
||||
|
||||
ha[6].n128_u32[0] = pChain[0]->H[1];
|
||||
ha[6].n128_u32[1] = pChain[1]->H[1];
|
||||
ha[6].n128_u32[2] = pChain[2]->H[1];
|
||||
ha[6].n128_u32[3] = pChain[3]->H[1];
|
||||
|
||||
ha[5].n128_u32[0] = pChain[0]->H[2];
|
||||
ha[5].n128_u32[1] = pChain[1]->H[2];
|
||||
ha[5].n128_u32[2] = pChain[2]->H[2];
|
||||
ha[5].n128_u32[3] = pChain[3]->H[2];
|
||||
|
||||
ha[4].n128_u32[0] = pChain[0]->H[3];
|
||||
ha[4].n128_u32[1] = pChain[1]->H[3];
|
||||
ha[4].n128_u32[2] = pChain[2]->H[3];
|
||||
ha[4].n128_u32[3] = pChain[3]->H[3];
|
||||
|
||||
ha[3].n128_u32[0] = pChain[0]->H[4];
|
||||
ha[3].n128_u32[1] = pChain[1]->H[4];
|
||||
ha[3].n128_u32[2] = pChain[2]->H[4];
|
||||
ha[3].n128_u32[3] = pChain[3]->H[4];
|
||||
|
||||
ha[2].n128_u32[0] = pChain[0]->H[5];
|
||||
ha[2].n128_u32[1] = pChain[1]->H[5];
|
||||
ha[2].n128_u32[2] = pChain[2]->H[5];
|
||||
ha[2].n128_u32[3] = pChain[3]->H[5];
|
||||
|
||||
ha[1].n128_u32[0] = pChain[0]->H[6];
|
||||
ha[1].n128_u32[1] = pChain[1]->H[6];
|
||||
ha[1].n128_u32[2] = pChain[2]->H[6];
|
||||
ha[1].n128_u32[3] = pChain[3]->H[6];
|
||||
|
||||
ha[0].n128_u32[0] = pChain[0]->H[7];
|
||||
ha[0].n128_u32[1] = pChain[1]->H[7];
|
||||
ha[0].n128_u32[2] = pChain[2]->H[7];
|
||||
ha[0].n128_u32[3] = pChain[3]->H[7];
|
||||
|
||||
ha[7] = vsetq_lane_u32( pChain[0]->H[0], ha[7], 0 );
|
||||
ha[7] = vsetq_lane_u32( pChain[1]->H[0], ha[7], 1 );
|
||||
ha[7] = vsetq_lane_u32( pChain[2]->H[0], ha[7], 2 );
|
||||
ha[7] = vsetq_lane_u32( pChain[3]->H[0], ha[7], 3 );
|
||||
|
||||
ha[6] = vsetq_lane_u32( pChain[0]->H[1], ha[6], 0 );
|
||||
ha[6] = vsetq_lane_u32( pChain[1]->H[1], ha[6], 1 );
|
||||
ha[6] = vsetq_lane_u32( pChain[2]->H[1], ha[6], 2 );
|
||||
ha[6] = vsetq_lane_u32( pChain[3]->H[1], ha[6], 3 );
|
||||
|
||||
ha[5] = vsetq_lane_u32( pChain[0]->H[2], ha[5], 0 );
|
||||
ha[5] = vsetq_lane_u32( pChain[1]->H[2], ha[5], 1 );
|
||||
ha[5] = vsetq_lane_u32( pChain[2]->H[2], ha[5], 2 );
|
||||
ha[5] = vsetq_lane_u32( pChain[3]->H[2], ha[5], 3 );
|
||||
|
||||
ha[4] = vsetq_lane_u32( pChain[0]->H[3], ha[4], 0 );
|
||||
ha[4] = vsetq_lane_u32( pChain[1]->H[3], ha[4], 1 );
|
||||
ha[4] = vsetq_lane_u32( pChain[2]->H[3], ha[4], 2 );
|
||||
ha[4] = vsetq_lane_u32( pChain[3]->H[3], ha[4], 3 );
|
||||
|
||||
ha[3] = vsetq_lane_u32( pChain[0]->H[4], ha[3], 0 );
|
||||
ha[3] = vsetq_lane_u32( pChain[1]->H[4], ha[3], 1 );
|
||||
ha[3] = vsetq_lane_u32( pChain[2]->H[4], ha[3], 2 );
|
||||
ha[3] = vsetq_lane_u32( pChain[3]->H[4], ha[3], 3 );
|
||||
|
||||
ha[2] = vsetq_lane_u32( pChain[0]->H[5], ha[2], 0 );
|
||||
ha[2] = vsetq_lane_u32( pChain[1]->H[5], ha[2], 1 );
|
||||
ha[2] = vsetq_lane_u32( pChain[2]->H[5], ha[2], 2 );
|
||||
ha[2] = vsetq_lane_u32( pChain[3]->H[5], ha[2], 3 );
|
||||
|
||||
ha[1] = vsetq_lane_u32( pChain[0]->H[6], ha[1], 0 );
|
||||
ha[1] = vsetq_lane_u32( pChain[1]->H[6], ha[1], 1 );
|
||||
ha[1] = vsetq_lane_u32( pChain[2]->H[6], ha[1], 2 );
|
||||
ha[1] = vsetq_lane_u32( pChain[3]->H[6], ha[1], 3 );
|
||||
|
||||
ha[0] = vsetq_lane_u32( pChain[0]->H[7], ha[0], 0 );
|
||||
ha[0] = vsetq_lane_u32( pChain[1]->H[7], ha[0], 1 );
|
||||
ha[0] = vsetq_lane_u32( pChain[2]->H[7], ha[0], 2 );
|
||||
ha[0] = vsetq_lane_u32( pChain[3]->H[7], ha[0], 3 );
|
||||
|
||||
buf[0] = ha[4];
|
||||
buf[1] = ha[5];
|
||||
|
@ -881,10 +879,10 @@ SymCryptParallelSha256AppendBlocks_neon(
|
|||
//
|
||||
for( r=0; r<16; r ++ )
|
||||
{
|
||||
T0.n128_u32[0] = SYMCRYPT_LOAD_MSBFIRST32( ppByte[0] ); ppByte[0] += 4;
|
||||
T0.n128_u32[1] = SYMCRYPT_LOAD_MSBFIRST32( ppByte[1] ); ppByte[1] += 4;
|
||||
T0.n128_u32[2] = SYMCRYPT_LOAD_MSBFIRST32( ppByte[2] ); ppByte[2] += 4;
|
||||
T0.n128_u32[3] = SYMCRYPT_LOAD_MSBFIRST32( ppByte[3] ); ppByte[3] += 4;
|
||||
T0 = vsetq_lane_u32( SYMCRYPT_LOAD_MSBFIRST32( ppByte[0] ), T0, 0 ); ppByte[0] += 4;
|
||||
T0 = vsetq_lane_u32( SYMCRYPT_LOAD_MSBFIRST32( ppByte[1] ), T0, 1 ); ppByte[1] += 4;
|
||||
T0 = vsetq_lane_u32( SYMCRYPT_LOAD_MSBFIRST32( ppByte[2] ), T0, 2 ); ppByte[2] += 4;
|
||||
T0 = vsetq_lane_u32( SYMCRYPT_LOAD_MSBFIRST32( ppByte[3] ), T0, 3 ); ppByte[3] += 4;
|
||||
W[r] = T0;
|
||||
}
|
||||
|
||||
|
@ -964,47 +962,46 @@ SymCryptParallelSha256AppendBlocks_neon(
|
|||
//
|
||||
// Copy the chaining state back into the hash structure
|
||||
//
|
||||
|
||||
pChain[0]->H[0] = ha[7].n128_u32[0];
|
||||
pChain[1]->H[0] = ha[7].n128_u32[1];
|
||||
pChain[2]->H[0] = ha[7].n128_u32[2];
|
||||
pChain[3]->H[0] = ha[7].n128_u32[3];
|
||||
|
||||
pChain[0]->H[1] = ha[6].n128_u32[0];
|
||||
pChain[1]->H[1] = ha[6].n128_u32[1];
|
||||
pChain[2]->H[1] = ha[6].n128_u32[2];
|
||||
pChain[3]->H[1] = ha[6].n128_u32[3];
|
||||
|
||||
pChain[0]->H[2] = ha[5].n128_u32[0];
|
||||
pChain[1]->H[2] = ha[5].n128_u32[1];
|
||||
pChain[2]->H[2] = ha[5].n128_u32[2];
|
||||
pChain[3]->H[2] = ha[5].n128_u32[3];
|
||||
|
||||
pChain[0]->H[3] = ha[4].n128_u32[0];
|
||||
pChain[1]->H[3] = ha[4].n128_u32[1];
|
||||
pChain[2]->H[3] = ha[4].n128_u32[2];
|
||||
pChain[3]->H[3] = ha[4].n128_u32[3];
|
||||
|
||||
pChain[0]->H[4] = ha[3].n128_u32[0];
|
||||
pChain[1]->H[4] = ha[3].n128_u32[1];
|
||||
pChain[2]->H[4] = ha[3].n128_u32[2];
|
||||
pChain[3]->H[4] = ha[3].n128_u32[3];
|
||||
|
||||
pChain[0]->H[5] = ha[2].n128_u32[0];
|
||||
pChain[1]->H[5] = ha[2].n128_u32[1];
|
||||
pChain[2]->H[5] = ha[2].n128_u32[2];
|
||||
pChain[3]->H[5] = ha[2].n128_u32[3];
|
||||
|
||||
pChain[0]->H[6] = ha[1].n128_u32[0];
|
||||
pChain[1]->H[6] = ha[1].n128_u32[1];
|
||||
pChain[2]->H[6] = ha[1].n128_u32[2];
|
||||
pChain[3]->H[6] = ha[1].n128_u32[3];
|
||||
|
||||
pChain[0]->H[7] = ha[0].n128_u32[0];
|
||||
pChain[1]->H[7] = ha[0].n128_u32[1];
|
||||
pChain[2]->H[7] = ha[0].n128_u32[2];
|
||||
pChain[3]->H[7] = ha[0].n128_u32[3];
|
||||
|
||||
pChain[0]->H[0] = vgetq_lane_u32( ha[7], 0 );
|
||||
pChain[1]->H[0] = vgetq_lane_u32( ha[7], 1 );
|
||||
pChain[2]->H[0] = vgetq_lane_u32( ha[7], 2 );
|
||||
pChain[3]->H[0] = vgetq_lane_u32( ha[7], 3 );
|
||||
|
||||
pChain[0]->H[1] = vgetq_lane_u32( ha[6], 0 );
|
||||
pChain[1]->H[1] = vgetq_lane_u32( ha[6], 1 );
|
||||
pChain[2]->H[1] = vgetq_lane_u32( ha[6], 2 );
|
||||
pChain[3]->H[1] = vgetq_lane_u32( ha[6], 3 );
|
||||
|
||||
pChain[0]->H[2] = vgetq_lane_u32( ha[5], 0 );
|
||||
pChain[1]->H[2] = vgetq_lane_u32( ha[5], 1 );
|
||||
pChain[2]->H[2] = vgetq_lane_u32( ha[5], 2 );
|
||||
pChain[3]->H[2] = vgetq_lane_u32( ha[5], 3 );
|
||||
|
||||
pChain[0]->H[3] = vgetq_lane_u32( ha[4], 0 );
|
||||
pChain[1]->H[3] = vgetq_lane_u32( ha[4], 1 );
|
||||
pChain[2]->H[3] = vgetq_lane_u32( ha[4], 2 );
|
||||
pChain[3]->H[3] = vgetq_lane_u32( ha[4], 3 );
|
||||
|
||||
pChain[0]->H[4] = vgetq_lane_u32( ha[3], 0 );
|
||||
pChain[1]->H[4] = vgetq_lane_u32( ha[3], 1 );
|
||||
pChain[2]->H[4] = vgetq_lane_u32( ha[3], 2 );
|
||||
pChain[3]->H[4] = vgetq_lane_u32( ha[3], 3 );
|
||||
|
||||
pChain[0]->H[5] = vgetq_lane_u32( ha[2], 0 );
|
||||
pChain[1]->H[5] = vgetq_lane_u32( ha[2], 1 );
|
||||
pChain[2]->H[5] = vgetq_lane_u32( ha[2], 2 );
|
||||
pChain[3]->H[5] = vgetq_lane_u32( ha[2], 3 );
|
||||
|
||||
pChain[0]->H[6] = vgetq_lane_u32( ha[1], 0 );
|
||||
pChain[1]->H[6] = vgetq_lane_u32( ha[1], 1 );
|
||||
pChain[2]->H[6] = vgetq_lane_u32( ha[1], 2 );
|
||||
pChain[3]->H[6] = vgetq_lane_u32( ha[1], 3 );
|
||||
|
||||
pChain[0]->H[7] = vgetq_lane_u32( ha[0], 0 );
|
||||
pChain[1]->H[7] = vgetq_lane_u32( ha[0], 1 );
|
||||
pChain[2]->H[7] = vgetq_lane_u32( ha[0], 2 );
|
||||
pChain[3]->H[7] = vgetq_lane_u32( ha[0], 3 );
|
||||
|
||||
SymCryptWipeKnownSize( buf, sizeof( buf ) );
|
||||
}
|
||||
|
||||
|
|
|
@ -37,7 +37,7 @@ include ksamd64.inc
|
|||
#define GET_SYMBOL_ADDRESS(__symbol) __symbol@plt+rip
|
||||
#define HEX(__constant) 0x##__constant
|
||||
#define TEXTAREA()
|
||||
#define EXTERN(__label)
|
||||
#define EXTERN(__label) .extern __label
|
||||
#define LABEL(__labelname) __labelname:
|
||||
|
||||
#else
|
||||
|
|
|
@ -19,7 +19,7 @@ set(KEEP_SYMBOL_ARGS
|
|||
)
|
||||
|
||||
# Determine the which executable to use for stripping binaries
|
||||
if(SYMCRYPT_TARGET_ARCH MATCHES ARM64 AND NOT CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "ARM64|aarch64")
|
||||
if(SYMCRYPT_TARGET_ARCH MATCHES ARM AND NOT CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "ARM64|aarch64")
|
||||
set(STRIP_COMMAND ${TARGET_TRIPLE}-strip)
|
||||
set(OBJCOPY_COMMAND ${TARGET_TRIPLE}-objcopy)
|
||||
else()
|
||||
|
|
|
@ -8,8 +8,152 @@
|
|||
|
||||
#include "precomp.h"
|
||||
|
||||
#if UINTPTR_MAX == 0xFFFFFFFF
|
||||
#define Elf_Shdr Elf32_Shdr
|
||||
#define Elf_Phdr Elf32_Phdr
|
||||
#define Elf_Sym Elf32_Sym
|
||||
#define Elf_Dyn Elf32_Dyn
|
||||
#define Elf_Ehdr Elf32_Ehdr
|
||||
#define Elf_Addr Elf32_Addr
|
||||
#define Elf_Off Elf32_Off
|
||||
#define Elf_Rel Elf32_Rel
|
||||
#define Elf_Rela Elf32_Rela
|
||||
#define ELF_R_TYPE(X) ELF32_R_TYPE(X)
|
||||
#define ELF_R_SYM(X) ELF32_R_SYM(X)
|
||||
#define Elf_Word Elf32_Word
|
||||
#define SYMCRYPT_FORCE_READ_ADDR SYMCRYPT_FORCE_READ32
|
||||
#elif UINTPTR_MAX == 0xFFFFFFFFFFFFFFFFu
|
||||
#define Elf_Shdr Elf64_Shdr
|
||||
#define Elf_Phdr Elf64_Phdr
|
||||
#define Elf_Sym Elf64_Sym
|
||||
#define Elf_Dyn Elf64_Dyn
|
||||
#define Elf_Ehdr Elf64_Ehdr
|
||||
#define Elf_Addr Elf64_Addr
|
||||
#define Elf_Off Elf64_Off
|
||||
#define Elf_Rel Elf64_Rel
|
||||
#define Elf_Rela Elf64_Rela
|
||||
#define ELF_R_TYPE(X) ELF64_R_TYPE(X)
|
||||
#define ELF_R_SYM(X) ELF64_R_SYM(X)
|
||||
#define Elf_Word Elf64_Word
|
||||
#define SYMCRYPT_FORCE_READ_ADDR SYMCRYPT_FORCE_READ64
|
||||
#else
|
||||
#error Unknown CPU pointer size
|
||||
#endif
|
||||
|
||||
#ifdef SYMCRYPT_DEBUG_INTEGRITY
|
||||
#include <stdio.h>
|
||||
|
||||
VOID
|
||||
DbgDumpHex(PCBYTE pbData, SIZE_T cbData)
|
||||
{
|
||||
ULONG i,count;
|
||||
CHAR digits[]="0123456789abcdef";
|
||||
CHAR pbLine[256];
|
||||
ULONG cbLine, cbHeader = 0;
|
||||
ULONG_PTR address;
|
||||
|
||||
if(pbData == NULL && cbData != 0)
|
||||
{
|
||||
// strcat_s(pbLine, RTL_NUMBER_OF(pbLine), "<null> buffer!!!\n");
|
||||
fprintf(stderr, "<null> buffer!!!\n");
|
||||
return;
|
||||
}
|
||||
|
||||
for(; cbData ; cbData -= count, pbData += count)
|
||||
{
|
||||
count = (cbData > 16) ? 16:cbData;
|
||||
|
||||
cbLine = cbHeader;
|
||||
|
||||
address = (ULONG_PTR)pbData;
|
||||
|
||||
#if UINTPTR_MAX == 0xFFFFFFFFFFFFFFFFu
|
||||
// 64 bit addresses.
|
||||
pbLine[cbLine++] = digits[(address >> 0x3c) & 0x0f];
|
||||
pbLine[cbLine++] = digits[(address >> 0x38) & 0x0f];
|
||||
pbLine[cbLine++] = digits[(address >> 0x34) & 0x0f];
|
||||
pbLine[cbLine++] = digits[(address >> 0x30) & 0x0f];
|
||||
pbLine[cbLine++] = digits[(address >> 0x2c) & 0x0f];
|
||||
pbLine[cbLine++] = digits[(address >> 0x28) & 0x0f];
|
||||
pbLine[cbLine++] = digits[(address >> 0x24) & 0x0f];
|
||||
pbLine[cbLine++] = digits[(address >> 0x20) & 0x0f];
|
||||
#endif
|
||||
pbLine[cbLine++] = digits[(address >> 0x1c) & 0x0f];
|
||||
pbLine[cbLine++] = digits[(address >> 0x18) & 0x0f];
|
||||
pbLine[cbLine++] = digits[(address >> 0x14) & 0x0f];
|
||||
pbLine[cbLine++] = digits[(address >> 0x10) & 0x0f];
|
||||
pbLine[cbLine++] = digits[(address >> 0x0c) & 0x0f];
|
||||
pbLine[cbLine++] = digits[(address >> 0x08) & 0x0f];
|
||||
pbLine[cbLine++] = digits[(address >> 0x04) & 0x0f];
|
||||
pbLine[cbLine++] = digits[(address ) & 0x0f];
|
||||
pbLine[cbLine++] = ' ';
|
||||
pbLine[cbLine++] = ' ';
|
||||
|
||||
for(i = 0; i < count; i++)
|
||||
{
|
||||
pbLine[cbLine++] = digits[pbData[i]>>4];
|
||||
pbLine[cbLine++] = digits[pbData[i]&0x0f];
|
||||
if(i == 7)
|
||||
{
|
||||
pbLine[cbLine++] = ':';
|
||||
}
|
||||
else
|
||||
{
|
||||
pbLine[cbLine++] = ' ';
|
||||
}
|
||||
}
|
||||
|
||||
for(; i < 16; i++)
|
||||
{
|
||||
pbLine[cbLine++] = ' ';
|
||||
pbLine[cbLine++] = ' ';
|
||||
pbLine[cbLine++] = ' ';
|
||||
}
|
||||
|
||||
pbLine[cbLine++] = ' ';
|
||||
|
||||
for(i = 0; i < count; i++)
|
||||
{
|
||||
if(pbData[i] < 32 || pbData[i] > 126)
|
||||
{
|
||||
pbLine[cbLine++] = '.';
|
||||
}
|
||||
else
|
||||
{
|
||||
pbLine[cbLine++] = pbData[i];
|
||||
}
|
||||
}
|
||||
|
||||
pbLine[cbLine++] = 0;
|
||||
|
||||
fprintf(stderr, "%s\n", pbLine);
|
||||
}
|
||||
}
|
||||
|
||||
VOID
|
||||
SYMCRYPT_CALL
|
||||
SymCryptHmacSha256AppendDbg(
|
||||
_In_ CHAR* pszLabel,
|
||||
_Inout_ PSYMCRYPT_HMAC_SHA256_STATE pState,
|
||||
_In_reads_( cbData ) PCBYTE pbData,
|
||||
SIZE_T cbData )
|
||||
{
|
||||
fprintf(stderr, "\nHMAC append: %s size %lx\n", pszLabel, cbData);
|
||||
DbgDumpHexString(pbData, (ULONG)cbData);
|
||||
SymCryptHmacSha256Append(pState, pbData, cbData);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
// These placeholder vaulues must match the values in process_fips_module.py
|
||||
#if UINTPTR_MAX == 0xFFFFFFFF
|
||||
#define PLACEHOLDER_VALUE 0x8BADF00D
|
||||
#elif UINTPTR_MAX == 0xFFFFFFFFFFFFFFFFu
|
||||
#define PLACEHOLDER_VALUE 0x4BADF00D8BADF00D
|
||||
#else
|
||||
#error Unknown CPU pointer size
|
||||
#endif
|
||||
|
||||
#define PLACEHOLDER_ARRAY \
|
||||
{\
|
||||
0x5B, 0x75, 0xBB, 0xE4, 0x9E, 0x18, 0x03, 0x55,\
|
||||
|
@ -29,11 +173,11 @@
|
|||
|
||||
// Relative virtual address of the HMAC key. Used to calculate where the module starts in memory
|
||||
// at runtime.
|
||||
const Elf64_Addr SymCryptVolatileFipsHmacKeyRva = (Elf64_Addr) PLACEHOLDER_VALUE;
|
||||
const Elf_Addr SymCryptVolatileFipsHmacKeyRva = (Elf_Addr) PLACEHOLDER_VALUE;
|
||||
|
||||
// Offset to the end of the FIPS module. Bytes after this offset are not considered part of our
|
||||
// FIPS module and are not included in the HMAC digest.
|
||||
const Elf64_Off SymCryptVolatileFipsBoundaryOffset = PLACEHOLDER_VALUE;
|
||||
const Elf_Off SymCryptVolatileFipsBoundaryOffset = PLACEHOLDER_VALUE;
|
||||
|
||||
// Key used for HMAC.
|
||||
const unsigned char SymCryptVolatileFipsHmacKey[32] = PLACEHOLDER_ARRAY;
|
||||
|
@ -44,32 +188,43 @@ unsigned char SymCryptVolatileFipsHmacDigest[SYMCRYPT_HMAC_SHA256_RESULT_SIZE] =
|
|||
|
||||
typedef struct
|
||||
{
|
||||
Elf64_Rela* rela;
|
||||
Elf_Rela* rela;
|
||||
Elf_Rel* rel;
|
||||
size_t relaEntryCount;
|
||||
Elf64_Rela* pltRela;
|
||||
size_t relEntryCount;
|
||||
union {
|
||||
Elf_Rela* rela;
|
||||
Elf_Rel* rel;
|
||||
Elf_Addr addr;
|
||||
} plt;
|
||||
size_t pltRelaEntryCount;
|
||||
} Elf64_Rela_Info;
|
||||
Elf_Addr pltRelAddendType;
|
||||
} Elf_Rela_Info;
|
||||
|
||||
|
||||
VOID SymCryptModuleUndoRelocation(
|
||||
_In_ const Elf64_Addr module_base,
|
||||
_Inout_ Elf64_Xword* const target,
|
||||
_In_ const Elf64_Rela* rela )
|
||||
_In_ const Elf_Addr module_base,
|
||||
_Inout_ Elf_Addr* const target,
|
||||
_In_ const Elf_Word relType )
|
||||
{
|
||||
Elf64_Xword replacement = 0;
|
||||
Elf_Addr replacement = 0;
|
||||
|
||||
switch( ELF64_R_TYPE( rela->r_info ) )
|
||||
switch( relType )
|
||||
{
|
||||
case R_X86_64_RELATIVE:
|
||||
case R_AARCH64_RELATIVE:
|
||||
replacement = *target - (Elf64_Off) module_base;
|
||||
case R_ARM_RELATIVE:
|
||||
replacement = *target - (Elf_Off) module_base;
|
||||
break;
|
||||
case R_X86_64_64:
|
||||
case R_X86_64_GLOB_DAT:
|
||||
case R_X86_64_JUMP_SLOT:
|
||||
case R_AARCH64_GLOB_DAT:
|
||||
case R_AARCH64_JUMP_SLOT:
|
||||
// R_X86_64_64, R_X86_64_GLOB_DAT and R_AARCH64_GLOB_DAT relocations all have initial
|
||||
// values of zero. R_X86_64_JUMP_SLOT and R_AARCH64_JUMP_SLOT relocations have initial
|
||||
case R_ARM_GLOB_DAT:
|
||||
case R_ARM_JUMP_SLOT:
|
||||
// R_X86_64_64 and R_*_GLOB_DAT relocations all have initial
|
||||
// values of zero. R_*_JUMP_SLOT relocations have initial
|
||||
// values that point into the PLT, but we set these to zero in our post-processing
|
||||
// script before HMACing the module. These relocation targets are excluded from our
|
||||
// FIPS module boundary because they're used for external function calls, which we
|
||||
|
@ -87,36 +242,51 @@ VOID SymCryptModuleUndoRelocation(
|
|||
}
|
||||
|
||||
VOID SymCryptModuleFindRelocationInfo(
|
||||
_In_ const Elf64_Dyn* const dynStart,
|
||||
_Out_ Elf64_Rela_Info* relaInfo)
|
||||
_In_ const Elf_Dyn* const dynStart,
|
||||
_Out_ Elf_Rela_Info* relaInfo)
|
||||
{
|
||||
relaInfo->rela = NULL;
|
||||
relaInfo->relaEntryCount = 0;
|
||||
relaInfo->pltRela = NULL;
|
||||
relaInfo->relEntryCount = 0;
|
||||
relaInfo->plt.addr = 0;
|
||||
relaInfo->pltRelaEntryCount = 0;
|
||||
|
||||
size_t relaTotalSize = 0;
|
||||
size_t relTotalSize = 0;
|
||||
size_t relaEntrySize = 0;
|
||||
size_t relEntrySize = 0;
|
||||
size_t pltTotalSize = 0;
|
||||
|
||||
for( const Elf64_Dyn* dyn = dynStart; dyn->d_tag != DT_NULL; ++dyn )
|
||||
for( const Elf_Dyn* dyn = dynStart; dyn->d_tag != DT_NULL; ++dyn )
|
||||
{
|
||||
switch( dyn->d_tag )
|
||||
{
|
||||
case DT_RELA:
|
||||
relaInfo->rela = ( Elf64_Rela* ) dyn->d_un.d_ptr;
|
||||
relaInfo->rela = ( Elf_Rela* ) dyn->d_un.d_ptr;
|
||||
break;
|
||||
|
||||
case DT_REL:
|
||||
relaInfo->rel = ( Elf_Rel* ) dyn->d_un.d_ptr;
|
||||
break;
|
||||
|
||||
case DT_RELASZ:
|
||||
relaTotalSize = dyn->d_un.d_val;
|
||||
break;
|
||||
|
||||
case DT_RELSZ:
|
||||
relTotalSize = dyn->d_un.d_val;
|
||||
break;
|
||||
|
||||
case DT_RELAENT:
|
||||
relaEntrySize = dyn->d_un.d_val;
|
||||
break;
|
||||
|
||||
case DT_RELENT:
|
||||
relEntrySize = dyn->d_un.d_val;
|
||||
break;
|
||||
|
||||
case DT_JMPREL:
|
||||
relaInfo->pltRela = ( Elf64_Rela* ) dyn->d_un.d_ptr;
|
||||
relaInfo->plt.addr = ( Elf_Addr ) dyn->d_un.d_ptr;
|
||||
break;
|
||||
|
||||
case DT_PLTRELSZ:
|
||||
|
@ -124,9 +294,8 @@ VOID SymCryptModuleFindRelocationInfo(
|
|||
break;
|
||||
|
||||
case DT_PLTREL:
|
||||
// Make sure PLT entries are DT_RELA entries and not DT_REL; we do not suppport
|
||||
// the latter
|
||||
SYMCRYPT_FIPS_ASSERT( dyn->d_un.d_val == DT_RELA );
|
||||
SYMCRYPT_FIPS_ASSERT( dyn->d_un.d_val == DT_RELA || dyn->d_un.d_val == DT_REL );
|
||||
relaInfo->pltRelAddendType = dyn->d_un.d_val;
|
||||
break;
|
||||
|
||||
default:
|
||||
|
@ -134,26 +303,45 @@ VOID SymCryptModuleFindRelocationInfo(
|
|||
}
|
||||
}
|
||||
|
||||
SYMCRYPT_FIPS_ASSERT( relaInfo->rela != NULL );
|
||||
SYMCRYPT_FIPS_ASSERT( relaEntrySize == sizeof( Elf64_Rela ) );
|
||||
SYMCRYPT_FIPS_ASSERT( relaTotalSize != 0 && relaTotalSize % relaEntrySize == 0 );
|
||||
// Need to have at least one type of relocations.
|
||||
SYMCRYPT_FIPS_ASSERT( relaInfo->rela != NULL || relaInfo->rel != NULL );
|
||||
|
||||
relaInfo->relaEntryCount = relaTotalSize / relaEntrySize;
|
||||
|
||||
// On AMD64 there should not be a PLT section, because we can't currently handle AMD64 PLT
|
||||
// relocations
|
||||
if( relaInfo->pltRela != NULL)
|
||||
if ( relaInfo->rela != NULL)
|
||||
{
|
||||
SYMCRYPT_FIPS_ASSERT( pltTotalSize != 0 && pltTotalSize % sizeof( Elf64_Rela ) == 0 );
|
||||
relaInfo->pltRelaEntryCount = pltTotalSize / sizeof( Elf64_Rela );
|
||||
SYMCRYPT_FIPS_ASSERT( relaEntrySize == sizeof( Elf_Rela ) );
|
||||
SYMCRYPT_FIPS_ASSERT( relaTotalSize != 0 && relaTotalSize % relaEntrySize == 0 );
|
||||
relaInfo->relaEntryCount = relaTotalSize / relaEntrySize;
|
||||
}
|
||||
if ( relaInfo->rel != NULL )
|
||||
{
|
||||
SYMCRYPT_FIPS_ASSERT( relaInfo->rel != NULL );
|
||||
SYMCRYPT_FIPS_ASSERT( relEntrySize == sizeof( Elf_Rel ) );
|
||||
SYMCRYPT_FIPS_ASSERT( relTotalSize != 0 && relTotalSize % relEntrySize == 0 );
|
||||
relaInfo->relEntryCount = relTotalSize / relEntrySize;
|
||||
}
|
||||
|
||||
if( relaInfo->plt.addr != 0)
|
||||
{
|
||||
SYMCRYPT_FIPS_ASSERT( pltTotalSize != 0 );
|
||||
// PLT relocations are either all rel or all rela.
|
||||
if ( relaInfo->pltRelAddendType == DT_RELA )
|
||||
{
|
||||
SYMCRYPT_FIPS_ASSERT( pltTotalSize % sizeof( Elf_Rela ) == 0 );
|
||||
relaInfo->pltRelaEntryCount = pltTotalSize / sizeof( Elf_Rela );
|
||||
}
|
||||
else
|
||||
{
|
||||
SYMCRYPT_FIPS_ASSERT( pltTotalSize % sizeof( Elf_Rel ) == 0 );
|
||||
relaInfo->pltRelaEntryCount = pltTotalSize / sizeof( Elf_Rel );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
size_t SymCryptModuleProcessSegmentWithRelocations(
|
||||
_In_ const Elf64_Addr module_base,
|
||||
_In_ const Elf64_Phdr* const programHeader,
|
||||
_In_ const Elf64_Dyn* const dynStart,
|
||||
_In_ const Elf64_Rela_Info* const relaInfo,
|
||||
_In_ const Elf_Addr module_base,
|
||||
_In_ const Elf_Phdr* const programHeader,
|
||||
_In_ const Elf_Dyn* const dynStart,
|
||||
_In_ const Elf_Rela_Info* const relaInfo,
|
||||
_Inout_ SYMCRYPT_HMAC_SHA256_STATE* hmacState )
|
||||
{
|
||||
// The segment that contains relocations consists of the following sections, in this order:
|
||||
|
@ -169,10 +357,19 @@ size_t SymCryptModuleProcessSegmentWithRelocations(
|
|||
// the module on disk is usually a different size than at runtime, so we cannot include it in
|
||||
// our HMAC either.
|
||||
//
|
||||
// In arm (32 bit) relocations also exist in .text section so we'll do SymCryptModuleProcessSegmentWithRelocations
|
||||
// on every section.
|
||||
//
|
||||
// FipsBoundaryOffset marks the start of the .data section, so we read from the start of the
|
||||
// segment up to that offset.
|
||||
size_t hashableSectionSize = SYMCRYPT_FORCE_READ64( &SymCryptVolatileFipsBoundaryOffset ) - programHeader->p_offset;
|
||||
Elf64_Addr segmentStart = module_base + programHeader->p_vaddr;
|
||||
size_t hashableSectionSize = programHeader->p_filesz;
|
||||
Elf_Addr segmentStart = module_base + programHeader->p_vaddr;
|
||||
|
||||
// If the data section is in this segment then exclude that from being hashed.
|
||||
if( SYMCRYPT_FORCE_READ_ADDR( &SymCryptVolatileFipsBoundaryOffset ) <= programHeader->p_offset + programHeader->p_filesz )
|
||||
{
|
||||
hashableSectionSize = SYMCRYPT_FORCE_READ_ADDR( &SymCryptVolatileFipsBoundaryOffset ) - programHeader->p_offset;
|
||||
}
|
||||
|
||||
BYTE* segmentCopy = SymCryptCallbackAlloc( hashableSectionSize );
|
||||
SYMCRYPT_FIPS_ASSERT( segmentCopy != NULL );
|
||||
|
@ -184,63 +381,103 @@ size_t SymCryptModuleProcessSegmentWithRelocations(
|
|||
// these relocations separately. We find the .dynamic section in the copied buffer based on
|
||||
// its offset from the start of the section, which is calculated by subtracting the address
|
||||
// of the start of the segment from the address of the .dynamic section in the segment.
|
||||
Elf64_Off dynOffsetInBuffer = (Elf64_Addr) dynStart - (Elf64_Addr) segmentStart;
|
||||
Elf64_Dyn* dynStartInBuffer = (Elf64_Dyn*) (segmentCopy + dynOffsetInBuffer);
|
||||
Elf_Off dynOffsetInBuffer = (Elf_Addr) dynStart - (Elf_Addr) segmentStart;
|
||||
Elf_Dyn* dynStartInBuffer = (Elf_Dyn*) (segmentCopy + dynOffsetInBuffer);
|
||||
|
||||
for( Elf64_Dyn* dyn = dynStartInBuffer; dyn->d_tag != DT_NULL; ++dyn )
|
||||
// If this segment contains the dynamic section then we need to process the relocations in it.
|
||||
if ((Elf_Addr)dynStart > segmentStart && (Elf_Addr)dynStart < segmentStart + hashableSectionSize)
|
||||
{
|
||||
// The following types of .dynamic entries have the module's base address added to
|
||||
// their initial value
|
||||
if( dyn->d_tag == DT_HASH ||
|
||||
dyn->d_tag == DT_STRTAB ||
|
||||
dyn->d_tag == DT_SYMTAB ||
|
||||
dyn->d_tag == DT_RELA ||
|
||||
dyn->d_tag == DT_GNU_HASH ||
|
||||
dyn->d_tag == DT_VERSYM ||
|
||||
dyn->d_tag == DT_PLTGOT ||
|
||||
dyn->d_tag == DT_JMPREL)
|
||||
for( Elf_Dyn* dyn = dynStartInBuffer; dyn->d_tag != DT_NULL; ++dyn )
|
||||
{
|
||||
dyn->d_un.d_val -= (Elf64_Xword) module_base;
|
||||
// The following types of .dynamic entries have the module's base address added to
|
||||
// their initial value
|
||||
if( dyn->d_tag == DT_HASH ||
|
||||
dyn->d_tag == DT_STRTAB ||
|
||||
dyn->d_tag == DT_SYMTAB ||
|
||||
dyn->d_tag == DT_RELA ||
|
||||
dyn->d_tag == DT_GNU_HASH ||
|
||||
dyn->d_tag == DT_VERSYM ||
|
||||
dyn->d_tag == DT_PLTGOT ||
|
||||
dyn->d_tag == DT_JMPREL ||
|
||||
dyn->d_tag == DT_REL)
|
||||
{
|
||||
dyn->d_un.d_val -= module_base;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Now we can process the normal relocations listed in the relocation table
|
||||
for( size_t i = 0; i < relaInfo->relaEntryCount; ++i )
|
||||
{
|
||||
const Elf64_Rela* rela = relaInfo->rela + i;
|
||||
const Elf_Rela* rela = relaInfo->rela + i;
|
||||
|
||||
// Find the relocation within the section. Note that for a shared object module,
|
||||
// rela->r_offset is actually a virtual address. Relocations can occur within the .data
|
||||
// section, which is outside our FIPS boundary, so any such relocations can be ignored.
|
||||
Elf64_Off offsetInBuffer = (Elf64_Off) rela->r_offset - (Elf64_Off) programHeader->p_vaddr;
|
||||
Elf_Off offsetInBuffer = (Elf_Off) rela->r_offset - (Elf_Off) programHeader->p_vaddr;
|
||||
if( offsetInBuffer > hashableSectionSize )
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
Elf64_Xword* target = (Elf64_Xword*) ( segmentCopy + offsetInBuffer);
|
||||
Elf_Addr* target = (Elf_Addr*) ( segmentCopy + offsetInBuffer);
|
||||
|
||||
SymCryptModuleUndoRelocation( module_base, target, rela );
|
||||
SymCryptModuleUndoRelocation( module_base, target, ELF_R_TYPE( rela->r_info ) );
|
||||
}
|
||||
|
||||
// Process the GOT entries from the .rela.plt section. Same as process above, just
|
||||
for( size_t i = 0; i < relaInfo->relEntryCount; ++i )
|
||||
{
|
||||
const Elf_Rel* rel = relaInfo->rel + i;
|
||||
|
||||
// Find the relocation within the section. Note that for a shared object module,
|
||||
// rela->r_offset is actually a virtual address. Relocations can occur within the .data
|
||||
// section, which is outside our FIPS boundary, so any such relocations can be ignored.
|
||||
Elf_Off offsetInBuffer = (Elf_Off) rel->r_offset - (Elf_Off) programHeader->p_vaddr;
|
||||
if( offsetInBuffer > hashableSectionSize )
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
Elf_Addr* target = (Elf_Addr*) ( segmentCopy + offsetInBuffer);
|
||||
|
||||
SymCryptModuleUndoRelocation( module_base, target, ELF_R_TYPE( rel->r_info ) );
|
||||
}
|
||||
|
||||
// Process the GOT entries from the .rela.plt or .rel.plt section. Same as process above, just
|
||||
// with a different table.
|
||||
for( size_t i = 0; i < relaInfo->pltRelaEntryCount; ++i)
|
||||
{
|
||||
const Elf64_Rela* rela = relaInfo->pltRela + i;
|
||||
Elf_Word type = 0;
|
||||
Elf_Off offsetInBuffer = 0;
|
||||
|
||||
if (relaInfo->pltRelAddendType == DT_RELA)
|
||||
{
|
||||
const Elf_Rela* rela = relaInfo->plt.rela + i;
|
||||
type = ELF_R_TYPE( rela->r_info );
|
||||
offsetInBuffer = (Elf_Off) rela->r_offset - (Elf_Off) programHeader->p_vaddr;
|
||||
}
|
||||
else
|
||||
{
|
||||
const Elf_Rel* rel = relaInfo->plt.rel + i;
|
||||
type = ELF_R_TYPE( rel->r_info );
|
||||
offsetInBuffer = (Elf_Off) rel->r_offset - (Elf_Off) programHeader->p_vaddr;
|
||||
}
|
||||
|
||||
Elf64_Off offsetInBuffer = (Elf64_Off) rela->r_offset - (Elf64_Off) programHeader->p_vaddr;
|
||||
if( offsetInBuffer > hashableSectionSize )
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
Elf64_Xword* target = (Elf64_Xword*) ( segmentCopy + offsetInBuffer);
|
||||
Elf_Addr* target = (Elf_Addr*) ( segmentCopy + offsetInBuffer);
|
||||
|
||||
SymCryptModuleUndoRelocation( module_base, target, rela );
|
||||
SymCryptModuleUndoRelocation( module_base, target, type );
|
||||
}
|
||||
|
||||
#if SYMCRYPT_DEBUG_INTEGRITY
|
||||
SymCryptHmacSha256AppendDbg( "Append after relocation adjust", hmacState, segmentCopy, hashableSectionSize );
|
||||
#else
|
||||
SymCryptHmacSha256Append( hmacState, segmentCopy, hashableSectionSize );
|
||||
#endif
|
||||
|
||||
SymCryptCallbackFree( segmentCopy );
|
||||
|
||||
|
@ -248,9 +485,9 @@ size_t SymCryptModuleProcessSegmentWithRelocations(
|
|||
}
|
||||
|
||||
VOID SymCryptModuleDoHmac(
|
||||
_In_ const Elf64_Addr module_base,
|
||||
_In_ const Elf64_Dyn* const dynStart,
|
||||
_In_ const Elf64_Rela_Info* const relaInfo )
|
||||
_In_ const Elf_Addr module_base,
|
||||
_In_ const Elf_Dyn* const dynStart,
|
||||
_In_ const Elf_Rela_Info* const relaInfo )
|
||||
{
|
||||
SYMCRYPT_ERROR scError = SYMCRYPT_NO_ERROR;
|
||||
SYMCRYPT_HMAC_SHA256_EXPANDED_KEY hmacKey;
|
||||
|
@ -263,37 +500,20 @@ VOID SymCryptModuleDoHmac(
|
|||
|
||||
SymCryptHmacSha256Init( &hmacState, &hmacKey );
|
||||
|
||||
const Elf64_Ehdr* header = (Elf64_Ehdr*) module_base;
|
||||
const Elf64_Phdr* programHeaderStart = (Elf64_Phdr*) ( module_base + header->e_phoff );
|
||||
const Elf_Ehdr* header = (Elf_Ehdr*) module_base;
|
||||
const Elf_Phdr* programHeaderStart = (Elf_Phdr*) ( module_base + header->e_phoff );
|
||||
|
||||
for( const Elf64_Phdr* programHeader = programHeaderStart;
|
||||
for( const Elf_Phdr* programHeader = programHeaderStart;
|
||||
programHeader->p_type == PT_LOAD; ++programHeader )
|
||||
{
|
||||
// Sometimes the virtual address of a segment is greater than its offset into the module
|
||||
// file on disk. This means extra NULL bytes will be inserted into the module's memory
|
||||
// space at runtime. Those bytes are not part of our FIPS boundary, so we skip over them
|
||||
// and always start reading from the segment's virtual address
|
||||
Elf64_Addr segmentStart = module_base + (Elf64_Off) programHeader->p_vaddr;
|
||||
Elf_Addr segmentStart = module_base + (Elf_Off) programHeader->p_vaddr;
|
||||
|
||||
if( (programHeader->p_flags & PF_W) == PF_W &&
|
||||
SYMCRYPT_FORCE_READ64( &SymCryptVolatileFipsBoundaryOffset ) <= programHeader->p_offset + programHeader->p_filesz )
|
||||
{
|
||||
// If we are processing the final writable segment (containing the .data section which
|
||||
// marks the end of our FIPS boundary), then we need to reverse relocations in it
|
||||
SymCryptModuleProcessSegmentWithRelocations( module_base, programHeader, dynStart,
|
||||
SymCryptModuleProcessSegmentWithRelocations( module_base, programHeader, dynStart,
|
||||
relaInfo, &hmacState );
|
||||
}
|
||||
else
|
||||
{
|
||||
// For AMD64/ARM64, non-writeable segments do not contain relocations, so we can write
|
||||
// them in their entirety without modification. Note that the size in memory of the
|
||||
// section may be larger than the size on disk, but again, the additional size in memory
|
||||
// is not part of our FIPS boundary
|
||||
// For now we assume that if there are writable segments before the final writable
|
||||
// segment that they also contain no relocations
|
||||
SymCryptHmacSha256Append( &hmacState, (PCBYTE) segmentStart,
|
||||
programHeader->p_filesz );
|
||||
}
|
||||
}
|
||||
|
||||
SymCryptHmacSha256Result( &hmacState, actualDigest );
|
||||
|
@ -307,32 +527,32 @@ VOID SymCryptModuleVerifyIntegrity(void)
|
|||
{
|
||||
// Verify that our placeholder values were modified after compile time. The build script
|
||||
// should have replaced the placeholder values with their expected values
|
||||
SYMCRYPT_FIPS_ASSERT( SYMCRYPT_FORCE_READ64( &SymCryptVolatileFipsHmacKeyRva ) != PLACEHOLDER_VALUE );
|
||||
SYMCRYPT_FIPS_ASSERT( SYMCRYPT_FORCE_READ64( &SymCryptVolatileFipsBoundaryOffset ) != PLACEHOLDER_VALUE );
|
||||
SYMCRYPT_FIPS_ASSERT( SYMCRYPT_FORCE_READ_ADDR( &SymCryptVolatileFipsHmacKeyRva ) != PLACEHOLDER_VALUE );
|
||||
SYMCRYPT_FIPS_ASSERT( SYMCRYPT_FORCE_READ_ADDR( &SymCryptVolatileFipsBoundaryOffset ) != PLACEHOLDER_VALUE );
|
||||
|
||||
const Elf64_Addr module_base = (Elf64_Addr) SymCryptVolatileFipsHmacKey -
|
||||
SYMCRYPT_FORCE_READ64( &SymCryptVolatileFipsHmacKeyRva );
|
||||
const Elf_Addr module_base = (Elf_Addr) SymCryptVolatileFipsHmacKey -
|
||||
SYMCRYPT_FORCE_READ_ADDR( &SymCryptVolatileFipsHmacKeyRva );
|
||||
|
||||
const Elf64_Ehdr* header = (Elf64_Ehdr*) module_base;
|
||||
const Elf_Ehdr* header = (Elf_Ehdr*) module_base;
|
||||
SYMCRYPT_FIPS_ASSERT( memcmp(header->e_ident.ident.magic, ElfMagic, sizeof(ElfMagic)) == 0 );
|
||||
SYMCRYPT_FIPS_ASSERT( header->e_type == ET_DYN );
|
||||
SYMCRYPT_FIPS_ASSERT( header->e_machine == EM_X86_64 || header->e_machine == EM_AARCH64 );
|
||||
SYMCRYPT_FIPS_ASSERT( header->e_machine == EM_X86_64 || header->e_machine == EM_AARCH64 || header->e_machine == EM_ARM );
|
||||
SYMCRYPT_FIPS_ASSERT( header->e_version == EV_CURRENT );
|
||||
SYMCRYPT_FIPS_ASSERT( header->e_ehsize == sizeof(Elf64_Ehdr) );
|
||||
SYMCRYPT_FIPS_ASSERT( header->e_phentsize == sizeof(Elf64_Phdr) );
|
||||
SYMCRYPT_FIPS_ASSERT( header->e_ehsize == sizeof(Elf_Ehdr) );
|
||||
SYMCRYPT_FIPS_ASSERT( header->e_phentsize == sizeof(Elf_Phdr) );
|
||||
|
||||
const Elf64_Phdr* programHeaderStart = (Elf64_Phdr*) ( module_base + header->e_phoff );
|
||||
const Elf_Phdr* programHeaderStart = (Elf_Phdr*) ( module_base + header->e_phoff );
|
||||
|
||||
Elf64_Rela_Info relaInfo = {};
|
||||
Elf_Rela_Info relaInfo = {};
|
||||
|
||||
Elf64_Dyn* dynStart = NULL;
|
||||
Elf_Dyn* dynStart = NULL;
|
||||
|
||||
for( unsigned int i = 0; i < header->e_phnum; ++i )
|
||||
{
|
||||
const Elf64_Phdr* programHeader = programHeaderStart + i;
|
||||
const Elf_Phdr* programHeader = programHeaderStart + i;
|
||||
if( programHeader->p_type == PT_DYNAMIC )
|
||||
{
|
||||
dynStart = (Elf64_Dyn*) (module_base + (Elf64_Off) programHeader->p_vaddr);
|
||||
dynStart = (Elf_Dyn*) (module_base + (Elf_Off) programHeader->p_vaddr);
|
||||
|
||||
SymCryptModuleFindRelocationInfo( dynStart, &relaInfo );
|
||||
|
||||
|
|
|
@ -1,21 +1,22 @@
|
|||
//
|
||||
// integrity.h
|
||||
// FIPS 140-3 integrity verification header for ELF binaries
|
||||
//
|
||||
// Copyright (c) Microsoft Corporation. Licensed under the MIT license.
|
||||
//
|
||||
|
||||
VOID SymCryptModuleVerifyIntegrity(void);
|
||||
//
|
||||
// This function verifies the integrity of the loadable segments of the SymCrypt ELF module using
|
||||
// HMAC-SHA256. The module must have been postprocessed after compilation using
|
||||
// process_fips_module.py. The integrity check finds the module's base address in memory by
|
||||
// subtracting the relative virtual address of a known variable from its actual address in memory.
|
||||
// It then uses the ELF header to find all the loadable segments in the module and calculate the
|
||||
// HMAC-SHA256 digest of these segments. For writeable segments which are subject to relocations,
|
||||
// the relocations will be reversed prior to being added to the HMAC, so that the HMAC input will
|
||||
// match the contents of the file on disk prior to relocation.
|
||||
//
|
||||
// If the integrity check fails for any reason, the module will fastfail, crashing the process,
|
||||
// since a failed integrity check means it cannot operate in compliance with FIPS 140-3.
|
||||
//
|
||||
//
|
||||
// integrity.h
|
||||
// FIPS 140-3 integrity verification header for ELF binaries
|
||||
//
|
||||
// Copyright (c) Microsoft Corporation. Licensed under the MIT license.
|
||||
//
|
||||
|
||||
VOID SymCryptModuleVerifyIntegrity(void);
|
||||
//
|
||||
// This function verifies the integrity of the loadable segments of the SymCrypt ELF module using
|
||||
// HMAC-SHA256. The module must have been postprocessed after compilation using
|
||||
// process_fips_module.py. The integrity check finds the module's base address in memory by
|
||||
// subtracting the relative virtual address of a known variable from its actual address in memory.
|
||||
// It then uses the ELF header to find all the loadable segments in the module and calculate the
|
||||
// HMAC-SHA256 digest of these segments. For writeable segments which are subject to relocations,
|
||||
// the relocations will be reversed prior to being added to the HMAC, so that the HMAC input will
|
||||
// match the contents of the file on disk prior to relocation.
|
||||
//
|
||||
// If the integrity check fails for any reason, the module will fastfail, crashing the process,
|
||||
// since a failed integrity check means it cannot operate in compliance with FIPS 140-3.
|
||||
//
|
||||
|
||||
|
|
|
@ -3,8 +3,8 @@ set(SOURCES
|
|||
../common/optional/rngforkdetection.c
|
||||
../common/optional/rngsecureurandom.c)
|
||||
|
||||
# Enable integrity verification if compiling for AMD64 or ARM64
|
||||
if(SYMCRYPT_TARGET_ARCH MATCHES "AMD64|ARM64")
|
||||
# Enable integrity verification if compiling for AMD64 or ARM64 or ARM
|
||||
if(SYMCRYPT_TARGET_ARCH MATCHES "AMD64|ARM")
|
||||
list(APPEND SOURCES ../common/integrity.c)
|
||||
else()
|
||||
list(APPEND SOURCES ../common/nointegrity.c)
|
||||
|
|
|
@ -15,7 +15,7 @@ import subprocess
|
|||
import sys
|
||||
from typing import List
|
||||
|
||||
ARCH_CMAKE = ("x86", "amd64", "arm64")
|
||||
ARCH_CMAKE = ("x86", "amd64", "arm64", "arm")
|
||||
CONFIG_CMAKE = ("Debug", "Release", "Sanitize")
|
||||
|
||||
ARCH_MSBUILD = ("x86", "amd64", "arm64")
|
||||
|
@ -35,8 +35,8 @@ def get_normalized_host_arch() -> str:
|
|||
normalized_arch = "amd64"
|
||||
elif re.fullmatch("ARM64|aarch64", host_arch):
|
||||
normalized_arch = "arm64"
|
||||
|
||||
# No support for ARM32 right now
|
||||
elif re.fullmatch("ARM32|aarch32", host_arch):
|
||||
normalized_arch = "arm"
|
||||
|
||||
if not normalized_arch:
|
||||
print("Unrecognized host architecture " + host_arch, file = sys.stderr)
|
||||
|
@ -82,7 +82,8 @@ def configure_cmake(args : argparse.Namespace) -> None:
|
|||
cmake_args.append("x64")
|
||||
elif args.arch == "arm64":
|
||||
cmake_args.append("arm64")
|
||||
|
||||
elif args.arch == "arm":
|
||||
cmake_args.append("arm")
|
||||
# No support for ARM32 right now
|
||||
|
||||
if args.host_arch != args.arch:
|
||||
|
|
|
@ -197,7 +197,7 @@ def main() -> None:
|
|||
|
||||
parser = argparse.ArgumentParser(description = "Packaging helper script for SymCrypt.")
|
||||
parser.add_argument("build_dir", type = pathlib.Path, help = "Build output directory.")
|
||||
parser.add_argument("arch", type = str.lower, help = "Architecture of the binaries to package (for inclusion in the package name).", choices = ("x86", "amd64", "arm64"))
|
||||
parser.add_argument("arch", type = str.lower, help = "Architecture of the binaries to package (for inclusion in the package name).", choices = ("x86", "amd64", "arm64", "arm"))
|
||||
parser.add_argument("config", type = str, help = "Build configuration.", choices = ["Debug", "Release", "Sanitize"])
|
||||
parser.add_argument("module_name", type = str, help = "Name of the module to package.")
|
||||
parser.add_argument("release_dir", type = pathlib.Path, help = "Directory to place the release in.")
|
||||
|
|
|
@ -21,7 +21,7 @@ import struct
|
|||
|
||||
from elftools.elf.constants import P_FLAGS
|
||||
from elftools.elf.elffile import ELFFile
|
||||
from elftools.elf.enums import ENUM_RELOC_TYPE_x64, ENUM_RELOC_TYPE_AARCH64
|
||||
from elftools.elf.enums import ENUM_RELOC_TYPE_x64, ENUM_RELOC_TYPE_AARCH64, ENUM_RELOC_TYPE_ARM
|
||||
|
||||
# Names of global constants in the FIPS module that need to be replaced
|
||||
KEY_NAME = "SymCryptVolatileFipsHmacKey"
|
||||
|
@ -33,9 +33,27 @@ DIGEST_NAME = "SymCryptVolatileFipsHmacDigest"
|
|||
CHAR_FORMAT_SPECIFIER = "s"
|
||||
QWORD_FORMAT_SPECIFIER = "Q"
|
||||
QWORD_BYTE_SIZE = struct.calcsize(QWORD_FORMAT_SPECIFIER)
|
||||
DWORD_FORMAT_SPECIFIER = "I"
|
||||
DWORD_BYTE_SIZE = struct.calcsize(DWORD_FORMAT_SPECIFIER)
|
||||
|
||||
RELOCATION_TYPE_SIZES = {
|
||||
ENUM_RELOC_TYPE_x64["R_X86_64_JUMP_SLOT"]: {
|
||||
'size': QWORD_BYTE_SIZE,
|
||||
'format': QWORD_FORMAT_SPECIFIER,
|
||||
},
|
||||
ENUM_RELOC_TYPE_AARCH64["R_AARCH64_JUMP_SLOT"]: {
|
||||
'size': QWORD_BYTE_SIZE,
|
||||
'format': QWORD_FORMAT_SPECIFIER,
|
||||
},
|
||||
ENUM_RELOC_TYPE_ARM["R_ARM_JUMP_SLOT"]: {
|
||||
'size': DWORD_BYTE_SIZE,
|
||||
'format': DWORD_FORMAT_SPECIFIER,
|
||||
},
|
||||
}
|
||||
|
||||
# Must match the placeholder values in integrity.c
|
||||
PLACEHOLDER_VALUE = struct.pack(QWORD_FORMAT_SPECIFIER, 0x8BADF00D)
|
||||
PLACEHOLDER_VALUE_64BIT = struct.pack(QWORD_FORMAT_SPECIFIER, 0x4BADF00D8BADF00D)
|
||||
PLACEHOLDER_VALUE_32BIT = struct.pack(DWORD_FORMAT_SPECIFIER, 0x8BADF00D)
|
||||
PLACEHOLDER_ARRAY = bytes((
|
||||
0x5B, 0x75, 0xBB, 0xE4, 0x9E, 0x18, 0x03, 0x55,
|
||||
0x08, 0x4E, 0x3F, 0xE7, 0x60, 0x7E, 0x4F, 0x08,
|
||||
|
@ -136,7 +154,8 @@ class ElfFileValueProxy(object):
|
|||
# The .data() method returns a bytes object. We can't use it to write back to the original
|
||||
# buffer, so we need to find the appropriate section within the stream using sh_offset and
|
||||
# write to that.
|
||||
logging.debug("Writing {} to offset {}".format(value.hex(), hex(self.offset)))
|
||||
# Note self.section.stream == self.elf_file.stream.
|
||||
logging.debug("Changing {} writing {} to offset {}".format(self.name, value.hex(), hex(self.offset)))
|
||||
self.section.stream.seek(self.offset)
|
||||
self.section.stream.write(value)
|
||||
|
||||
|
@ -145,7 +164,7 @@ class ElfFileValueProxy(object):
|
|||
assert(len(new_value) == self.length)
|
||||
|
||||
if self.name is not None:
|
||||
logging.debug("Changing {} value".format(self.name))
|
||||
logging.debug("Changing {} value to {}".format(self.name, *args))
|
||||
|
||||
self.value = new_value
|
||||
|
||||
|
@ -156,6 +175,36 @@ def log_value(var):
|
|||
hex(var.vaddr),
|
||||
var.value.hex()))
|
||||
|
||||
def dbg_dump_hex(data, address=0, file=None):
|
||||
digits = "0123456789abcdef"
|
||||
char_per_line = 16
|
||||
remaining = len(data)
|
||||
for line_pos in range(0, len(data), char_per_line):
|
||||
chars_line = char_per_line if remaining > char_per_line else remaining
|
||||
line = "{:08x} ".format(address)
|
||||
address += chars_line
|
||||
|
||||
for i in range(char_per_line):
|
||||
if i < chars_line:
|
||||
line += "{:02x}".format(data[line_pos + i])
|
||||
if i == 7:
|
||||
line += ":"
|
||||
else:
|
||||
line += " "
|
||||
else:
|
||||
line += " "
|
||||
|
||||
line += " "
|
||||
|
||||
for i in range(chars_line):
|
||||
if data[line_pos + i] < 32 or data[line_pos + i] > 126:
|
||||
line += "."
|
||||
else:
|
||||
line += chr(data[line_pos + i])
|
||||
print(line, file=file)
|
||||
remaining -= char_per_line
|
||||
|
||||
|
||||
def hmac_module(loadable_segments, data_section_offset, key, digest, dump_file_path = None):
|
||||
"""
|
||||
Performs HMAC-SHA256 on module contents and writes it back to the module buffer
|
||||
|
@ -164,6 +213,10 @@ def hmac_module(loadable_segments, data_section_offset, key, digest, dump_file_p
|
|||
module_bytes = bytearray()
|
||||
last_segment_offset = -1
|
||||
|
||||
dump_file = None
|
||||
if dump_file_path is not None:
|
||||
dump_file = open(dump_file_path + '.txt', "w")
|
||||
|
||||
for (index, segment) in enumerate(loadable_segments):
|
||||
|
||||
segment_hashable_length = 0
|
||||
|
@ -175,9 +228,13 @@ def hmac_module(loadable_segments, data_section_offset, key, digest, dump_file_p
|
|||
if segment["p_offset"] + segment["p_filesz"] > data_section_offset:
|
||||
segment_hashable_length = data_section_offset - segment["p_offset"]
|
||||
module_bytes += segment.data()[:segment_hashable_length]
|
||||
print("\nHMAC append: off {:08x} past data segment size {:x}".format(segment['p_offset'], segment_hashable_length), file=dump_file)
|
||||
else:
|
||||
module_bytes += segment.data()
|
||||
segment_hashable_length = len(segment.data())
|
||||
print("\nHMAC append: off {:08x} normal size {:x}".format(segment['p_offset'], segment_hashable_length), file=dump_file)
|
||||
|
||||
dbg_dump_hex(segment.data()[:segment_hashable_length], file=dump_file)
|
||||
|
||||
logging.info("Segment {}: {} - {}".format(
|
||||
index,
|
||||
|
@ -186,7 +243,8 @@ def hmac_module(loadable_segments, data_section_offset, key, digest, dump_file_p
|
|||
|
||||
last_segment_offset = segment["p_offset"]
|
||||
|
||||
if dump_file_path is not None:
|
||||
if dump_file is not None:
|
||||
dump_file.close()
|
||||
with open(dump_file_path, "wb") as dump_file:
|
||||
dump_file.write(module_bytes)
|
||||
|
||||
|
@ -265,29 +323,34 @@ def overwrite_jump_slots(elf_file, new_value):
|
|||
(vaddr, original value), so that they can be reset after the HMAC digest is calculated.
|
||||
"""
|
||||
|
||||
rela_plt_section = elf_file.get_section_by_name(".rela.plt")
|
||||
|
||||
# Jump slot relocations always live in the .rela.plt section. If there is no .rela.plt section,
|
||||
# there will be no jump slot relocations
|
||||
if rela_plt_section is None:
|
||||
return []
|
||||
|
||||
original_jump_slot_values = []
|
||||
for relocation in rela_plt_section.iter_relocations():
|
||||
|
||||
# Jump slot relocations live in .rela.plt or .rel.plt sections.
|
||||
rela_plt_section = elf_file.get_section_by_name(".rela.plt")
|
||||
rel_plt_section = elf_file.get_section_by_name(".rel.plt")
|
||||
relocations = []
|
||||
if rela_plt_section is not None:
|
||||
relocations += rela_plt_section.iter_relocations()
|
||||
if rel_plt_section is not None:
|
||||
relocations += rel_plt_section.iter_relocations()
|
||||
|
||||
for relocation in relocations:
|
||||
relocation_type = relocation["r_info_type"]
|
||||
if (relocation_type == ENUM_RELOC_TYPE_x64["R_X86_64_JUMP_SLOT"]
|
||||
or relocation_type == ENUM_RELOC_TYPE_AARCH64["R_AARCH64_JUMP_SLOT"]):
|
||||
|
||||
logging.debug("Found relocation type {}".format(relocation_type))
|
||||
if relocation_type in RELOCATION_TYPE_SIZES:
|
||||
# Note that r_offset is actually a virtual address
|
||||
relocation_value = ElfFileValueProxy.from_vaddr(elf_file, relocation["r_offset"], QWORD_BYTE_SIZE)
|
||||
relocation_value = ElfFileValueProxy.from_vaddr(elf_file, relocation["r_offset"],
|
||||
RELOCATION_TYPE_SIZES[relocation_type]["size"])
|
||||
|
||||
original_value_int = struct.unpack(QWORD_FORMAT_SPECIFIER, relocation_value.value)[0]
|
||||
original_jump_slot_values.append((relocation_value.vaddr, original_value_int))
|
||||
original_value_int = struct.unpack(RELOCATION_TYPE_SIZES[relocation_type]["format"], relocation_value.value)[0]
|
||||
original_jump_slot_values.append((relocation_value.vaddr, original_value_int, relocation_type))
|
||||
|
||||
logging.debug("Updating relocation at {} with original value {}".format(
|
||||
hex(relocation_value.offset), hex(original_value_int)))
|
||||
|
||||
relocation_value.set_value(QWORD_FORMAT_SPECIFIER, new_value)
|
||||
relocation_value.set_value(RELOCATION_TYPE_SIZES[relocation_type]["format"], new_value)
|
||||
else:
|
||||
logging.warning("Unknown relocation type {} found at offset {}".format(
|
||||
relocation_type, relocation["r_offset"]))
|
||||
|
@ -301,13 +364,14 @@ def reset_jump_slots(elf_file, original_jump_slot_values):
|
|||
lazy binding still works.
|
||||
"""
|
||||
|
||||
for vaddr, original_value in original_jump_slot_values:
|
||||
relocation_value = ElfFileValueProxy.from_vaddr(elf_file, vaddr, QWORD_BYTE_SIZE)
|
||||
for vaddr, original_value, relocation_type in original_jump_slot_values:
|
||||
relocation_value = ElfFileValueProxy.from_vaddr(elf_file, vaddr,
|
||||
RELOCATION_TYPE_SIZES[relocation_type]["size"])
|
||||
|
||||
logging.debug("Resetting relocation at {} to original value {}".format(
|
||||
hex(vaddr), hex(original_value)))
|
||||
|
||||
relocation_value.set_value(QWORD_FORMAT_SPECIFIER, original_value)
|
||||
relocation_value.set_value(RELOCATION_TYPE_SIZES[relocation_type]["format"], original_value)
|
||||
|
||||
def main():
|
||||
"""
|
||||
|
@ -335,7 +399,7 @@ def main():
|
|||
elf_file = ELFFile(buffer_stream)
|
||||
|
||||
arch = elf_file["e_machine"]
|
||||
if not arch == "EM_X86_64" and not arch == "EM_AARCH64":
|
||||
if not arch == "EM_X86_64" and not arch == "EM_AARCH64" and not arch == "EM_ARM":
|
||||
logging.error("Unsupported architecture {}".format(arch))
|
||||
raise RuntimeError
|
||||
|
||||
|
@ -354,20 +418,32 @@ def main():
|
|||
# Find the HMAC key relative virtual address placeholder and replace it with
|
||||
# the actual relative virtual address of the HMAC key
|
||||
key_rva_variable = ElfFileValueProxy.from_symbol_name(elf_file, KEY_RVA_NAME)
|
||||
assert(key_rva_variable.value == PLACEHOLDER_VALUE)
|
||||
if arch == "EM_ARM":
|
||||
assert(key_rva_variable.value == PLACEHOLDER_VALUE_32BIT)
|
||||
else:
|
||||
assert(key_rva_variable.value == PLACEHOLDER_VALUE_64BIT)
|
||||
|
||||
key_rva_variable.set_value(QWORD_FORMAT_SPECIFIER, key_variable.vaddr)
|
||||
if arch == "EM_ARM":
|
||||
key_rva_variable.set_value(DWORD_FORMAT_SPECIFIER, key_variable.vaddr)
|
||||
else:
|
||||
key_rva_variable.set_value(QWORD_FORMAT_SPECIFIER, key_variable.vaddr)
|
||||
log_value(key_rva_variable)
|
||||
|
||||
# Find the FIPS module boundary placeholder and replace it with the our actual
|
||||
# FIPS module boundary, which we have defined to be the start of the .data section
|
||||
fips_boundary_variable = ElfFileValueProxy.from_symbol_name(elf_file, BOUNDARY_OFFSET_NAME)
|
||||
assert(fips_boundary_variable.value == PLACEHOLDER_VALUE)
|
||||
if arch == "EM_ARM":
|
||||
assert(fips_boundary_variable.value == PLACEHOLDER_VALUE_32BIT)
|
||||
else:
|
||||
assert(fips_boundary_variable.value == PLACEHOLDER_VALUE_64BIT)
|
||||
|
||||
data_section = elf_file.get_section_by_name(".data")
|
||||
data_section_offset = data_section["sh_offset"]
|
||||
|
||||
fips_boundary_variable.set_value(QWORD_FORMAT_SPECIFIER, data_section_offset)
|
||||
if arch == "EM_ARM":
|
||||
fips_boundary_variable.set_value(DWORD_FORMAT_SPECIFIER, data_section_offset)
|
||||
else:
|
||||
fips_boundary_variable.set_value(QWORD_FORMAT_SPECIFIER, data_section_offset)
|
||||
log_value(fips_boundary_variable)
|
||||
|
||||
# Find the HMAC digest placeholder, HMAC the loadable segments of the module, and replace
|
||||
|
|
|
@ -20,9 +20,9 @@ appropriate to enable this effort.
|
|||
Normally the processing of symcryptasm files takes place in 2 passes. The first pass is performed by
|
||||
this symcryptasm_processor.py script, which does the more stateful processing, outputting a .cppasm
|
||||
file. If the processed symcryptasm file includes other files via the INCLUDE directive, the contents
|
||||
of the included files are merged at their point of inclusion to generate a single expanded symcryptasm
|
||||
file which is saved with a .symcryptasmexp extension to the output folder. For symcryptasm files which
|
||||
do not include other files, there's no corresponding .symcryptasmexp file as it would be identical to
|
||||
of the included files are merged at their point of inclusion to generate a single expanded symcryptasm
|
||||
file which is saved with a .symcryptasmexp extension to the output folder. For symcryptasm files which
|
||||
do not include other files, there's no corresponding .symcryptasmexp file as it would be identical to
|
||||
the source file.
|
||||
|
||||
The .cppasm files are further processed by the C preprocessor to do more simple stateless text
|
||||
|
@ -78,7 +78,7 @@ A leaf function ends with the FUNCTION_END macro, which also takes the function
|
|||
|
||||
At the function start a prologue is generated which arranges the arguments appropriately in
|
||||
registers, and saves non-volatile registers that have been requested to be used.
|
||||
At the function end an epilogue is generated with restores the non-volatile registers and returns.
|
||||
At the function end an epilogue is generated which restores the non-volatile registers and returns.
|
||||
|
||||
|
||||
A nested function (a function which does call another function) is specified similarly, only using
|
||||
|
@ -112,14 +112,14 @@ prologue. The MUL_FUNCTION_START and MUL_FUNCTION_END macros are used in this ca
|
|||
We currently do not support nested mul functions, as we have none of them.
|
||||
|
||||
Stack layout for amd64 is as follows. Xmm registers are volatile and not saved on Linux.
|
||||
|
||||
|
||||
Memory Exists if
|
||||
|-------------------|
|
||||
| |
|
||||
| Shadow space |
|
||||
| |
|
||||
| Shadow space |
|
||||
| |
|
||||
|-------------------|
|
||||
| Return address |
|
||||
| Return address |
|
||||
|-------------------|
|
||||
| Non-volatile |
|
||||
| general purpose | reg_count > volatile_registers
|
||||
|
@ -149,6 +149,11 @@ currently support, these registers are volatile so do not need any special handl
|
|||
X_0 is always the result register and the first argument passed to the function.
|
||||
X_1-X_7 are the arguments 2-8 passed to the function
|
||||
|
||||
|
||||
### arm (32) ###
|
||||
We allow the registers r0-r12 to be addressed as r13-r15 are special registers that we cannot use as general purpose registers.
|
||||
As r12 is volatile in a leaf function, it should be used in preference to r4, to avoid spilling/restoring a register.
|
||||
|
||||
"""
|
||||
|
||||
import re
|
||||
|
@ -216,6 +221,24 @@ ARM64_R28 = Register("x28", "w28")
|
|||
ARM64_R29 = Register("x29", "w29") # Frame Pointer
|
||||
ARM64_R30 = Register("x30", "w30") # Link Register
|
||||
|
||||
# arm32 registers
|
||||
ARM32_R0 = Register(None, "r0")
|
||||
ARM32_R1 = Register(None, "r1")
|
||||
ARM32_R2 = Register(None, "r2")
|
||||
ARM32_R3 = Register(None, "r3")
|
||||
ARM32_R4 = Register(None, "r4")
|
||||
ARM32_R5 = Register(None, "r5")
|
||||
ARM32_R6 = Register(None, "r6")
|
||||
ARM32_R7 = Register(None, "r7")
|
||||
ARM32_R8 = Register(None, "r8")
|
||||
ARM32_R9 = Register(None, "r9")
|
||||
ARM32_R10 = Register(None, "r10")
|
||||
ARM32_R11 = Register(None, "r11")
|
||||
ARM32_R12 = Register(None, "r12")
|
||||
ARM32_R13 = Register(None, "r13")
|
||||
ARM32_R14 = Register(None, "r14")
|
||||
ARM32_R15 = Register(None, "r15")
|
||||
|
||||
class CallingConvention:
|
||||
"""A class to represent calling conventions"""
|
||||
|
||||
|
@ -306,7 +329,7 @@ def calc_amd64_stack_allocation_sizes(self, reg_count, stack_alloc_size, xmm_reg
|
|||
aligned_on_16B = True
|
||||
|
||||
# Calculate the space needed to save Xmm registers on the stack
|
||||
saved_reg_xmm = 0 if xmm_reg_count <= 6 else (xmm_reg_count - 6)
|
||||
saved_reg_xmm = 0 if xmm_reg_count <= 6 else (xmm_reg_count - 6)
|
||||
xmm_save_size = 16 * saved_reg_xmm
|
||||
if xmm_save_size > 0 and not aligned_on_16B:
|
||||
xmm_save_size += 8
|
||||
|
@ -337,7 +360,7 @@ def gen_prologue_amd64_msft(self, arg_count, reg_count, stack_alloc_size, xmm_re
|
|||
|
||||
prologue = "\n"
|
||||
|
||||
# Calculate the sizes of the buffers needed for saving registers, local variable buffer and shadow space.
|
||||
# Calculate the sizes of the buffers needed for saving registers, local variable buffer and shadow space.
|
||||
# Each of the sections other than general purpose registers are aligned on 16B boundary and some of them
|
||||
# may include an 8B padding in their size.
|
||||
reg_save_size, xmm_save_size, stack_alloc_aligned_size, shadow_space_allocation_size = calc_amd64_stack_allocation_sizes(
|
||||
|
@ -380,12 +403,12 @@ def gen_prologue_amd64_msft_nested(self, arg_count, reg_count, stack_alloc_size,
|
|||
return gen_prologue_amd64_msft(self, arg_count, reg_count, stack_alloc_size, xmm_reg_count, mul_fixup = "", nested = True)
|
||||
|
||||
def gen_epilogue_amd64_msft(self, arg_count, reg_count, stack_alloc_size, xmm_reg_count, nested = False):
|
||||
|
||||
|
||||
epilogue = "\n"
|
||||
|
||||
reg_save_size, xmm_save_size, stack_alloc_aligned_size, shadow_space_allocation_size = calc_amd64_stack_allocation_sizes(
|
||||
self, reg_count, stack_alloc_size, xmm_reg_count, nested)
|
||||
|
||||
|
||||
# Restore non-volatile Xmm registers
|
||||
if xmm_save_size > 0:
|
||||
for i in range(6, xmm_reg_count):
|
||||
|
@ -454,7 +477,7 @@ MAPPING_AMD64_SYSTEMV = {
|
|||
}
|
||||
|
||||
def gen_prologue_amd64_systemv(self, arg_count, reg_count, stack_alloc_size, xmm_reg_count, mul_fixup = "", nested = False):
|
||||
|
||||
|
||||
# Calculate the sizes required for each section
|
||||
# We need to call with xmm_reg_count=0 to avoid allocation/alignment for saving Xmm registers since they're
|
||||
# volatile for this calling convention.
|
||||
|
@ -491,7 +514,7 @@ def gen_prologue_amd64_systemv_nested(self, arg_count, reg_count, stack_alloc_si
|
|||
return gen_prologue_amd64_systemv(self, arg_count, reg_count, stack_alloc_size, xmm_reg_count, mul_fixup = "", nested = True)
|
||||
|
||||
def gen_epilogue_amd64_systemv(self, arg_count, reg_count, stack_alloc_size, xmm_reg_count, nested = False):
|
||||
|
||||
|
||||
epilogue = ""
|
||||
|
||||
# Calculate the sizes required for each section
|
||||
|
@ -601,6 +624,27 @@ MAPPING_ARM64_ARM64ECMSFT = {
|
|||
# R28 is reserved in ARM64EC
|
||||
}
|
||||
|
||||
# ARM32 calling convention
|
||||
# A subroutine must preserve the contents of the registers r4-r8, r10, r11 and SP (and r9 in PCS variants that designate r9 as v6).
|
||||
MAPPING_ARM32_AAPCS32 = {
|
||||
0: ARM32_R0, # Argument 1 / Result register / volatile
|
||||
1: ARM32_R1, # Argument 2 / Result register / volatile
|
||||
2: ARM32_R2, # Argument 3 / volatile
|
||||
3: ARM32_R3, # Argument 4 / volatile
|
||||
4: ARM32_R4, # non-volatile
|
||||
5: ARM32_R5, # non-volatile
|
||||
6: ARM32_R6, # non-volatile
|
||||
7: ARM32_R7, # non-volatile
|
||||
8: ARM32_R8, # non-volatile
|
||||
9: ARM32_R9, # reserved for something
|
||||
10:ARM32_R10, # non-volatile
|
||||
11:ARM32_R11, # FP non-volatile
|
||||
12:ARM32_R12, # volatile for leaf functions
|
||||
13:ARM32_R13, # SP
|
||||
14:ARM32_R14, # LR
|
||||
15:ARM32_R15, # PC
|
||||
}
|
||||
|
||||
def gen_prologue_aapcs64(self, arg_count, reg_count, stack_alloc_size, xmm_reg_count):
|
||||
prologue = ""
|
||||
|
||||
|
@ -621,6 +665,38 @@ def gen_epilogue_aapcs64(self, arg_count, reg_count, stack_alloc_size, xmm_reg_c
|
|||
|
||||
return epilogue
|
||||
|
||||
def gen_prologue_aapcs32(self, arg_count, reg_count, stack_alloc_size, xmm_reg_count):
|
||||
assert(not stack_alloc_size and not xmm_reg_count)
|
||||
prologue = ""
|
||||
# Always spill at least 1 register (LR).
|
||||
# LR needs to be saved for nested functions but for now we'll store it always
|
||||
# since we don't differentiate between nested and leaf functions for arm yet.
|
||||
registers_to_spill = []
|
||||
logging.info(f"prologue {reg_count} > {self.volatile_registers}")
|
||||
if reg_count > self.volatile_registers:
|
||||
for i in range(self.volatile_registers, reg_count):
|
||||
registers_to_spill.append('r%s' % i)
|
||||
# Stack pointer is word 4B aligned
|
||||
# required_stack_space = 4 * len(registers_to_spill)
|
||||
registers_to_spill.append('lr')
|
||||
prologue += "push {" + ",".join(registers_to_spill) + "}\n"
|
||||
return prologue
|
||||
|
||||
def gen_epilogue_aapcs32(self, arg_count, reg_count, stack_alloc_size, xmm_reg_count):
|
||||
assert(not stack_alloc_size and not xmm_reg_count)
|
||||
epilogue = ""
|
||||
|
||||
registers_to_spill = []
|
||||
logging.info(f"epilogue {reg_count} > {self.volatile_registers}")
|
||||
if reg_count > self.volatile_registers:
|
||||
for i in range(self.volatile_registers, reg_count):
|
||||
registers_to_spill.append('r%s' % i)
|
||||
# Stack pointer is word 4B aligned
|
||||
# required_stack_space = 4 * len(registers_to_spill)
|
||||
registers_to_spill.append('pc')
|
||||
epilogue += "pop {" + ",".join(registers_to_spill) + "}\n"
|
||||
return epilogue
|
||||
|
||||
def gen_prologue_arm64ec(self, arg_count, reg_count, stack_alloc_size, xmm_reg_count):
|
||||
prologue = ""
|
||||
|
||||
|
@ -677,6 +753,10 @@ CALLING_CONVENTION_ARM64EC_MSFT = CallingConvention(
|
|||
"arm64ec_msft", "arm64", MAPPING_ARM64_ARM64ECMSFT, 8, 8, 16,
|
||||
gen_prologue_arm64ec, gen_epilogue_arm64ec, gen_get_memslot_offset_arm64)
|
||||
|
||||
CALLING_CONVENTION_ARM32_AAPCS32 = CallingConvention(
|
||||
"arm32_aapcs32", "arm32", MAPPING_ARM32_AAPCS32, 4, 4, 4,
|
||||
gen_prologue_aapcs32, gen_epilogue_aapcs32, gen_get_memslot_offset_arm64)
|
||||
|
||||
def gen_function_defines(architecture, mapping, arg_count, reg_count, start=True):
|
||||
defines = ""
|
||||
if architecture == "amd64":
|
||||
|
@ -687,6 +767,8 @@ def gen_function_defines(architecture, mapping, arg_count, reg_count, start=True
|
|||
elif architecture == "arm64":
|
||||
prefix64 = "X_"
|
||||
prefix32 = "W_"
|
||||
elif architecture == "arm32":
|
||||
return defines
|
||||
else:
|
||||
logging.error("Unhandled architecture (%s) in gen_function_defines" % architecture)
|
||||
exit(1)
|
||||
|
@ -736,8 +818,8 @@ MASM_FUNCTION_TEMPLATE = "%s, _TEXT\n"
|
|||
# ARMASM64 function macros must be correctly indented
|
||||
ARMASM64_FUNCTION_TEMPLATE = " %s\n"
|
||||
|
||||
GAS_FUNCTION_ENTRY = "%s: .global %s\n"
|
||||
GAS_FUNCTION_END = ""
|
||||
GAS_FUNCTION_ENTRY = "%s: .global %s\n.type %s, %%function\n// .func %s\n"
|
||||
GAS_FUNCTION_END = "// .endfunc // %s"
|
||||
|
||||
def generate_prologue(assembler, calling_convention, function_name, arg_count, reg_count, stack_alloc_size, xmm_reg_count, nested):
|
||||
function_entry = None
|
||||
|
@ -756,7 +838,7 @@ def generate_prologue(assembler, calling_convention, function_name, arg_count, r
|
|||
elif assembler == "armasm64":
|
||||
function_entry = ARMASM64_FUNCTION_TEMPLATE % function_entry
|
||||
elif assembler == "gas":
|
||||
function_entry = GAS_FUNCTION_ENTRY % (function_name, function_name)
|
||||
function_entry = GAS_FUNCTION_ENTRY % (function_name, function_name, function_name, function_name)
|
||||
else:
|
||||
logging.error("Unhandled assembler (%s) in generate_prologue" % assembler)
|
||||
exit(1)
|
||||
|
@ -784,7 +866,7 @@ def generate_epilogue(assembler, calling_convention, function_name, arg_count, r
|
|||
elif assembler == "armasm64":
|
||||
function_end = ARMASM64_FUNCTION_TEMPLATE % function_end
|
||||
elif assembler == "gas":
|
||||
function_end = GAS_FUNCTION_END
|
||||
function_end = GAS_FUNCTION_END % function_name
|
||||
else:
|
||||
logging.error("Unhandled assembler (%s) in generate_epilogue" % assembler)
|
||||
exit(1)
|
||||
|
@ -928,11 +1010,11 @@ class ProcessingStateMachine:
|
|||
else:
|
||||
self.calling_convention = self.normal_calling_convention
|
||||
|
||||
return generate_prologue(self.assembler,
|
||||
self.calling_convention,
|
||||
self.function_name,
|
||||
self.arg_count,
|
||||
self.reg_count,
|
||||
return generate_prologue(self.assembler,
|
||||
self.calling_convention,
|
||||
self.function_name,
|
||||
self.arg_count,
|
||||
self.reg_count,
|
||||
self.stack_alloc_size,
|
||||
self.xmm_reg_count,
|
||||
self.is_nested_function
|
||||
|
@ -1115,6 +1197,10 @@ def process_file(assembler, architecture, calling_convention, infilename, outfil
|
|||
normal_calling_convention = CALLING_CONVENTION_ARM64_AAPCS64
|
||||
mul_calling_convention = None
|
||||
nested_calling_convention = None
|
||||
elif architecture == "arm" and calling_convention == "aapcs32":
|
||||
normal_calling_convention = CALLING_CONVENTION_ARM32_AAPCS32
|
||||
mul_calling_convention = None
|
||||
nested_calling_convention = None
|
||||
elif assembler == "armasm64":
|
||||
if architecture == "arm64" and calling_convention == "aapcs64":
|
||||
normal_calling_convention = CALLING_CONVENTION_ARM64_AAPCS64
|
||||
|
@ -1149,7 +1235,7 @@ def process_file(assembler, architecture, calling_convention, infilename, outfil
|
|||
|
||||
# expand_files() is called recursively when a .symcryptasm file contains an INCLUDE directive,
|
||||
# except for the first call here where we're starting to process the input source file
|
||||
# as if it was included by some other file.
|
||||
# as if it was included by some other file.
|
||||
expanded_file, infile_has_includes = expand_files(infilename, 0, "")
|
||||
expanded_lines.extend(expanded_file)
|
||||
|
||||
|
@ -1163,17 +1249,20 @@ def process_file(assembler, architecture, calling_convention, infilename, outfil
|
|||
with open(outfilename, "w") as outfile:
|
||||
for line_num, line in enumerate(expanded_lines):
|
||||
processed_line = file_processing_state.process_line(line, line_num)
|
||||
# logging.info("processed line: %s" % processed_line)
|
||||
outfile.write(processed_line)
|
||||
|
||||
if __name__ == "__main__":
|
||||
import argparse
|
||||
|
||||
# logging.basicConfig(level=logging.INFO)
|
||||
parser = argparse.ArgumentParser(description="Preprocess symcryptasm into files that will be further processed with C preprocessor to generate MASM or GAS")
|
||||
parser.add_argument('assembler', type=str, help='Assembler that we want to preprocess for', choices=['masm', 'gas', 'armasm64'])
|
||||
parser.add_argument('architecture', type=str, help='Architecture that we want to preprocess for', choices=['amd64', 'arm64'])
|
||||
parser.add_argument('calling_convention', type=str, help='Calling convention that we want to preprocess for', choices=['msft', 'systemv', 'aapcs64', 'arm64ec'])
|
||||
parser.add_argument('architecture', type=str, help='Architecture that we want to preprocess for', choices=['amd64', 'arm64', 'arm'])
|
||||
parser.add_argument('calling_convention', type=str, help='Calling convention that we want to preprocess for', choices=['msft', 'systemv', 'aapcs64', 'arm64ec', 'aapcs32'])
|
||||
parser.add_argument('inputfile', type=str, help='Path to input file')
|
||||
parser.add_argument('outputfile', type=str, help='Path to output file')
|
||||
|
||||
args = parser.parse_args()
|
||||
process_file(args.assembler, args.architecture, args.calling_convention, args.inputfile, args.outputfile)
|
||||
logging.info("Done")
|
|
@ -18,6 +18,6 @@ target_link_libraries(symcryptunittest symcryptunittest_lib symcrypt_common atom
|
|||
|
||||
# Export oe_sgx_get_additional_host_entropy from the executable so that if we dynamically load
|
||||
# oe module, the linker can find the version which is locally defined in the executable
|
||||
target_link_options(symcryptunittest PRIVATE
|
||||
target_link_options(symcryptunittest PRIVATE
|
||||
-Wl,--dynamic-list=${CMAKE_CURRENT_SOURCE_DIR}/dynamic-list.ver
|
||||
)
|
||||
|
|
|
@ -97,5 +97,11 @@ else()
|
|||
add_compile_options(-DINCLUDE_IMPL_RSA32=0)
|
||||
endif()
|
||||
|
||||
if(SYMCRYPT_TARGET_ARCH STREQUAL "ARM" AND CMAKE_C_COMPILER_ID MATCHES "GNU")
|
||||
# Hide warning due to abi change.
|
||||
set_source_files_properties(kat.cpp PROPERTIES COMPILE_OPTIONS "-Wno-psabi")
|
||||
set_source_files_properties(perf.cpp PROPERTIES COMPILE_OPTIONS "-Wno-psabi")
|
||||
endif()
|
||||
|
||||
add_library(symcryptunittest_lib STATIC ${SOURCES})
|
||||
set_target_properties(symcryptunittest_lib PROPERTIES PREFIX "")
|
||||
|
|
Загрузка…
Ссылка в новой задаче