Merged PR 9746161: Support cbNonce != 12 for AES-GCM

This change adds support for nonces larger or smaller than 12 bytes for AES-GCM (and for GCM generally, if it's ever used with other block ciphers). It adds 32-bit CTR functions, as required by the GCM spec. Previously we used a 64-bit CTR function, which worked for 12-byte nonces because the counter block always started at 1 and the GCM message length limit is 2^36 - 32 bytes, so the 32-bit counter portion could never overflow into the adjacent nonce bytes. A 64-bit counter does not work for other nonce sizes because the counter block starts at an arbitrary value computed by GHASHing the nonce, so a carry out of the low 32 bits could propagate where the GCM spec requires the counter to wrap.

It also updates the "stitched" implementations of AES-GCM to use 32-bit addition intrinsics instead of 64-bit addition.
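
As a rough illustration of the 32-bit counter behaviour described above (a minimal sketch, not SymCrypt code; the helper name is made up for the example): the increment must wrap within the last four bytes of the counter block and leave bytes 0-11 untouched, which a 64-bit add only guarantees when the counter starts at 1.

    #include <stdint.h>

    // Hypothetical helper: MSB-first 32-bit increment of a 16-byte counter block.
    static void Inc32Msb( uint8_t block[16] )
    {
        uint32_t ctr = ((uint32_t)block[12] << 24) | ((uint32_t)block[13] << 16) |
                       ((uint32_t)block[14] <<  8) |  (uint32_t)block[15];
        ctr += 1;   // wraps modulo 2^32; a carry never reaches bytes 8-11
        block[12] = (uint8_t)(ctr >> 24);
        block[13] = (uint8_t)(ctr >> 16);
        block[14] = (uint8_t)(ctr >>  8);
        block[15] = (uint8_t) ctr;
    }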

Tested with unit tests:
- AMD64 with all features enabled
- AMD64 without aesni
- AMD64 without pclmulqdq
- AMD64 with everything disabled except for rdrand, rdseed, savexmmnofail
- ARM64 hardware (Galaxy Book 2) + qemu (via pipeline)

Related work items: #33824154
Mitch Lindgren 🦎 2023-10-31 22:21:18 +00:00
Parent 4359d75a43
Commit d6933e03f6
16 changed files with 1191 additions and 564 deletions

View file

@ -6,6 +6,7 @@ prior to the creation of a new release, based on the changes contained in that r
- Extended SymCrypt support for XTS-AES adding support for 128-bit tweak and ciphertext-stealing
- Added support for salt length detection in RSA-PSS verification
- Export various constant time operations from SymCrypt Linux modules
- Added support for nonce sizes other than 12 bytes for AES-GCM
# Version 103.3.2

View file

@ -3495,7 +3495,7 @@ SymCryptMarvin32Selftest(void);
// for different block cipher computations as the expanded key is not modified once initialized.
//
// SymCryptXxxBlockCipher
// A SYMCRYPT_BLOCK_CIPHER structure that provides a description
// A SYMCRYPT_BLOCKCIPHER structure that provides a description
// of the block cipher and its primary functions. This is used by cipher modes to pass
// all the block-cipher specific information in a single structure.
//
@ -4152,8 +4152,7 @@ SymCryptCtrMsb64(
SIZE_T cbData );
//
// This function implements the CTR cipher mode.
// It is not intended to be used as-is, rather it is a building block
// for modes like CCM and GCM.
// It is not intended to be used as-is, rather it is a building block for modes like CCM.
// On some platforms we have optimized code for AES-CTR, on other platforms
// we use this generic construction to achieve the same effect.
//
@ -4174,7 +4173,6 @@ SymCryptCtrMsb64(
// buffers may be the same or non-overlapping, but may not partially overlap.
//
VOID
SYMCRYPT_CALL
SymCryptCfbEncrypt(

View file

@ -361,6 +361,51 @@ SymCryptAesCbcMac(
#endif
}
VOID
SYMCRYPT_CALL
SymCryptAesCtrMsb32(
_In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
_Inout_updates_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbChainingValue,
_In_reads_( cbData ) PCBYTE pbSrc,
_Out_writes_( cbData ) PBYTE pbDst,
SIZE_T cbData )
{
#if SYMCRYPT_CPU_AMD64
if( SYMCRYPT_CPU_FEATURES_PRESENT( SYMCRYPT_CPU_FEATURES_FOR_AESNI_CODE ) )
{
SymCryptAesCtrMsb32Xmm( pExpandedKey, pbChainingValue, pbSrc, pbDst, cbData );
} else {
SYMCRYPT_ASSERT( SymCryptAesBlockCipherNoOpt.blockSize == SYMCRYPT_AES_BLOCK_SIZE ); // keep Prefast happy
SymCryptCtrMsb32( &SymCryptAesBlockCipherNoOpt, pExpandedKey, pbChainingValue, pbSrc, pbDst, cbData );
}
#elif SYMCRYPT_CPU_X86
SYMCRYPT_EXTENDED_SAVE_DATA SaveData;
if( SYMCRYPT_CPU_FEATURES_PRESENT( SYMCRYPT_CPU_FEATURES_FOR_AESNI_CODE ) &&
SymCryptSaveXmm( &SaveData ) == SYMCRYPT_NO_ERROR )
{
SymCryptAesCtrMsb32Xmm( pExpandedKey, pbChainingValue, pbSrc, pbDst, cbData );
SymCryptRestoreXmm( &SaveData );
} else {
SYMCRYPT_ASSERT( SymCryptAesBlockCipherNoOpt.blockSize == SYMCRYPT_AES_BLOCK_SIZE ); // keep Prefast happy
SymCryptCtrMsb32( &SymCryptAesBlockCipherNoOpt, pExpandedKey, pbChainingValue, pbSrc, pbDst, cbData );
}
#elif SYMCRYPT_CPU_ARM64
if( SYMCRYPT_CPU_FEATURES_PRESENT( SYMCRYPT_CPU_FEATURE_NEON_AES ) )
{
SymCryptAesCtrMsb32Neon( pExpandedKey, pbChainingValue, pbSrc, pbDst, cbData );
} else {
SymCryptCtrMsb32( &SymCryptAesBlockCipherNoOpt, pExpandedKey, pbChainingValue, pbSrc, pbDst, cbData );
}
#else
SYMCRYPT_ASSERT( SymCryptAesBlockCipherNoOpt.blockSize == SYMCRYPT_AES_BLOCK_SIZE ); // keep Prefast happy
SymCryptCtrMsb32( &SymCryptAesBlockCipherNoOpt, pExpandedKey, pbChainingValue, pbSrc, pbDst, cbData );
#endif
}
VOID
SYMCRYPT_CALL
SymCryptAesCtrMsb64(
@ -529,7 +574,7 @@ SymCryptAesGcmEncryptPartOnePass(
bytesToProcess );
#else
SymCryptAesCtrMsb64(&pState->pKey->blockcipherKey.aes,
SymCryptAesCtrMsb32(&pState->pKey->blockcipherKey.aes,
&pState->counterBlock[0],
pbSrc,
pbDst,
@ -563,7 +608,7 @@ SymCryptAesGcmEncryptPartOnePass(
SymCryptWipeKnownSize( &pState->keystreamBlock[0], SYMCRYPT_GCM_BLOCK_SIZE );
SYMCRYPT_ASSERT( pState->pKey->pBlockCipher->blockSize == SYMCRYPT_GCM_BLOCK_SIZE );
SymCryptAesCtrMsb64(&pState->pKey->blockcipherKey.aes,
SymCryptAesCtrMsb32(&pState->pKey->blockcipherKey.aes,
&pState->counterBlock[0],
&pState->keystreamBlock[0],
&pState->keystreamBlock[0],
@ -712,7 +757,7 @@ SymCryptAesGcmDecryptPartOnePass(
// This violates the read-once rule, but it is safe for the same reasons as above
// in the encryption case.
//
SymCryptAesCtrMsb64(&pState->pKey->blockcipherKey.aes,
SymCryptAesCtrMsb32(&pState->pKey->blockcipherKey.aes,
&pState->counterBlock[0],
pbSrc,
pbDst,
@ -729,7 +774,7 @@ SymCryptAesGcmDecryptPartOnePass(
SymCryptWipeKnownSize( &pState->keystreamBlock[0], SYMCRYPT_GCM_BLOCK_SIZE );
SYMCRYPT_ASSERT( pState->pKey->pBlockCipher->blockSize == SYMCRYPT_GCM_BLOCK_SIZE );
SymCryptAesCtrMsb64(&pState->pKey->blockcipherKey.aes,
SymCryptAesCtrMsb32(&pState->pKey->blockcipherKey.aes,
&pState->counterBlock[0],
&pState->keystreamBlock[0],
&pState->keystreamBlock[0],

View file

@ -769,172 +769,26 @@ SymCryptAesEcbEncryptNeon(
#pragma warning( disable:4701 ) // "Use of uninitialized variable"
#pragma runtime_checks( "u", off )
VOID
SYMCRYPT_CALL
SymCryptAesCtrMsb64Neon(
_In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
_Inout_updates_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbChainingValue,
_In_reads_( cbData ) PCBYTE pbSrc,
_Out_writes_( cbData ) PBYTE pbDst,
SIZE_T cbData )
{
__n128 chain = *(__n128 *)pbChainingValue;
const __n128 * pSrc = (const __n128 *) pbSrc;
__n128 * pDst = (__n128 *) pbDst;
#define SYMCRYPT_AesCtrMsbXxNeon SymCryptAesCtrMsb64Neon
#define VADDQ_UXX vaddq_u64
#define VSUBQ_UXX vsubq_u64
const __n128 chainIncrement1 = SYMCRYPT_SET_N128_U64( 0, 1 );
const __n128 chainIncrement2 = SYMCRYPT_SET_N128_U64( 0, 2 );
const __n128 chainIncrement8 = SYMCRYPT_SET_N128_U64( 0, 8 );
#include "aes-pattern.c"
__n128 ctr0, ctr1, ctr2, ctr3, ctr4, ctr5, ctr6, ctr7;
__n128 c0, c1, c2, c3, c4, c5, c6, c7;
#undef VSUBQ_UXX
#undef VADDQ_UXX
#undef SYMCRYPT_AesCtrMsbXxNeon
cbData &= ~(SYMCRYPT_AES_BLOCK_SIZE - 1);
#define SYMCRYPT_AesCtrMsbXxNeon SymCryptAesCtrMsb32Neon
#define VADDQ_UXX vaddq_u32
#define VSUBQ_UXX vsubq_u32
// Our chain variable is in integer format, not the MSBfirst format loaded from memory.
ctr0 = vrev64q_u8( chain );
ctr1 = vaddq_u64( ctr0, chainIncrement1 );
ctr2 = vaddq_u64( ctr0, chainIncrement2 );
ctr3 = vaddq_u64( ctr1, chainIncrement2 );
ctr4 = vaddq_u64( ctr2, chainIncrement2 );
ctr5 = vaddq_u64( ctr3, chainIncrement2 );
ctr6 = vaddq_u64( ctr4, chainIncrement2 );
ctr7 = vaddq_u64( ctr5, chainIncrement2 );
#include "aes-pattern.c"
/*
while cbData >= 5 * block
generate 8 blocks of key stream
if cbData < 8 * block
break;
process 8 blocks
if cbData >= 5 * block
process 5-7 blocks
done
if cbData >= 2 * block
generate 4 blocks of key stream
process 2-4 blocks
done
if cbData == 1 block
generate 1 block of key stream
process block
*/
while( cbData >= 5 * SYMCRYPT_AES_BLOCK_SIZE )
{
c0 = vrev64q_u8( ctr0 );
c1 = vrev64q_u8( ctr1 );
c2 = vrev64q_u8( ctr2 );
c3 = vrev64q_u8( ctr3 );
c4 = vrev64q_u8( ctr4 );
c5 = vrev64q_u8( ctr5 );
c6 = vrev64q_u8( ctr6 );
c7 = vrev64q_u8( ctr7 );
#undef VSUBQ_UXX
#undef VADDQ_UXX
#undef SYMCRYPT_AesCtrMsbXxNeon
ctr0 = vaddq_u64( ctr0, chainIncrement8 );
ctr1 = vaddq_u64( ctr1, chainIncrement8 );
ctr2 = vaddq_u64( ctr2, chainIncrement8 );
ctr3 = vaddq_u64( ctr3, chainIncrement8 );
ctr4 = vaddq_u64( ctr4, chainIncrement8 );
ctr5 = vaddq_u64( ctr5, chainIncrement8 );
ctr6 = vaddq_u64( ctr6, chainIncrement8 );
ctr7 = vaddq_u64( ctr7, chainIncrement8 );
AES_ENCRYPT_8( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7 );
if( cbData < 8 * SYMCRYPT_AES_BLOCK_SIZE )
{
break;
}
pDst[0] = veorq_u64( pSrc[0], c0 );
pDst[1] = veorq_u64( pSrc[1], c1 );
pDst[2] = veorq_u64( pSrc[2], c2 );
pDst[3] = veorq_u64( pSrc[3], c3 );
pDst[4] = veorq_u64( pSrc[4], c4 );
pDst[5] = veorq_u64( pSrc[5], c5 );
pDst[6] = veorq_u64( pSrc[6], c6 );
pDst[7] = veorq_u64( pSrc[7], c7 );
pDst += 8;
pSrc += 8;
cbData -= 8 * SYMCRYPT_AES_BLOCK_SIZE;
}
//
// At this point we have one of the two following cases:
// - cbData >= 5 * 16 and we have 8 blocks of key stream in c0-c7. ctr0-ctr7 is set to (c0+8)-(c7+8)
// - cbData < 5 * 16 and we have no blocks of key stream, and ctr0-ctr7 set to the next 8 counters to use
//
if( cbData >= SYMCRYPT_AES_BLOCK_SIZE ) // quick exit of function if the request was a multiple of 8 blocks
{
if( cbData >= 5 * SYMCRYPT_AES_BLOCK_SIZE )
{
//
// We already have the key stream
//
pDst[0] = veorq_u64( pSrc[0], c0 );
pDst[1] = veorq_u64( pSrc[1], c1 );
pDst[2] = veorq_u64( pSrc[2], c2 );
pDst[3] = veorq_u64( pSrc[3], c3 );
pDst[4] = veorq_u64( pSrc[4], c4 );
chain = vsubq_u64( ctr5, chainIncrement8 );
if( cbData >= 96 )
{
chain = vsubq_u64( ctr6, chainIncrement8 );
pDst[5] = veorq_u64( pSrc[5], c5 );
if( cbData >= 112 )
{
chain = vsubq_u64( ctr7, chainIncrement8 );
pDst[6] = veorq_u64( pSrc[6], c6 );
}
}
}
else if( cbData >= 2 * SYMCRYPT_AES_BLOCK_SIZE )
{
// Produce 4 blocks of key stream
chain = ctr2; // chain is only incremented by 2 for now
c0 = vrev64q_u8( ctr0 );
c1 = vrev64q_u8( ctr1 );
c2 = vrev64q_u8( ctr2 );
c3 = vrev64q_u8( ctr3 );
AES_ENCRYPT_4( pExpandedKey, c0, c1, c2, c3 );
pDst[0] = veorq_u64( pSrc[0], c0 );
pDst[1] = veorq_u64( pSrc[1], c1 );
if( cbData >= 48 )
{
chain = ctr3;
pDst[2] = veorq_u64( pSrc[2], c2 );
if( cbData >= 64 )
{
chain = ctr4;
pDst[3] = veorq_u64( pSrc[3], c3 );
}
}
}
else
{
// Exactly 1 block to process
chain = ctr1;
c0 = vrev64q_u8( ctr0 );
AES_ENCRYPT_1( pExpandedKey, c0 );
pDst[0] = veorq_u64( pSrc[0], c0 );
}
}
else
{
chain = ctr0;
}
chain = vrev64q_u8( chain );
*(__n128 *)pbChainingValue = chain;
}
#pragma runtime_checks( "u", restore )
#pragma warning(pop)
@ -1662,13 +1516,13 @@ SymCryptAesGcmEncryptStitchedNeon(
// Our chain variable is in integer format, not the MSBfirst format loaded from memory.
ctr0 = vrev64q_u8( chain );
ctr1 = vaddq_u64( ctr0, chainIncrement1 );
ctr2 = vaddq_u64( ctr0, chainIncrement2 );
ctr3 = vaddq_u64( ctr1, chainIncrement2 );
ctr4 = vaddq_u64( ctr2, chainIncrement2 );
ctr5 = vaddq_u64( ctr3, chainIncrement2 );
ctr6 = vaddq_u64( ctr4, chainIncrement2 );
ctr7 = vaddq_u64( ctr5, chainIncrement2 );
ctr1 = vaddq_u32( ctr0, chainIncrement1 );
ctr2 = vaddq_u32( ctr0, chainIncrement2 );
ctr3 = vaddq_u32( ctr1, chainIncrement2 );
ctr4 = vaddq_u32( ctr2, chainIncrement2 );
ctr5 = vaddq_u32( ctr3, chainIncrement2 );
ctr6 = vaddq_u32( ctr4, chainIncrement2 );
ctr7 = vaddq_u32( ctr5, chainIncrement2 );
state = *(__n128 *) pState;
@ -1689,14 +1543,14 @@ SymCryptAesGcmEncryptStitchedNeon(
if ( cbData >= 8 * SYMCRYPT_AES_BLOCK_SIZE )
{
ctr0 = vaddq_u64( ctr0, chainIncrement8 );
ctr1 = vaddq_u64( ctr1, chainIncrement8 );
ctr2 = vaddq_u64( ctr2, chainIncrement8 );
ctr3 = vaddq_u64( ctr3, chainIncrement8 );
ctr4 = vaddq_u64( ctr4, chainIncrement8 );
ctr5 = vaddq_u64( ctr5, chainIncrement8 );
ctr6 = vaddq_u64( ctr6, chainIncrement8 );
ctr7 = vaddq_u64( ctr7, chainIncrement8 );
ctr0 = vaddq_u32( ctr0, chainIncrement8 );
ctr1 = vaddq_u32( ctr1, chainIncrement8 );
ctr2 = vaddq_u32( ctr2, chainIncrement8 );
ctr3 = vaddq_u32( ctr3, chainIncrement8 );
ctr4 = vaddq_u32( ctr4, chainIncrement8 );
ctr5 = vaddq_u32( ctr5, chainIncrement8 );
ctr6 = vaddq_u32( ctr6, chainIncrement8 );
ctr7 = vaddq_u32( ctr7, chainIncrement8 );
// Encrypt first 8 blocks
pDst[0] = veorq_u64( pSrc[0], c0 );
@ -1723,14 +1577,14 @@ SymCryptAesGcmEncryptStitchedNeon(
c6 = vrev64q_u8( ctr6 );
c7 = vrev64q_u8( ctr7 );
ctr0 = vaddq_u64( ctr0, chainIncrement8 );
ctr1 = vaddq_u64( ctr1, chainIncrement8 );
ctr2 = vaddq_u64( ctr2, chainIncrement8 );
ctr3 = vaddq_u64( ctr3, chainIncrement8 );
ctr4 = vaddq_u64( ctr4, chainIncrement8 );
ctr5 = vaddq_u64( ctr5, chainIncrement8 );
ctr6 = vaddq_u64( ctr6, chainIncrement8 );
ctr7 = vaddq_u64( ctr7, chainIncrement8 );
ctr0 = vaddq_u32( ctr0, chainIncrement8 );
ctr1 = vaddq_u32( ctr1, chainIncrement8 );
ctr2 = vaddq_u32( ctr2, chainIncrement8 );
ctr3 = vaddq_u32( ctr3, chainIncrement8 );
ctr4 = vaddq_u32( ctr4, chainIncrement8 );
ctr5 = vaddq_u32( ctr5, chainIncrement8 );
ctr6 = vaddq_u32( ctr6, chainIncrement8 );
ctr7 = vaddq_u32( ctr7, chainIncrement8 );
AES_GCM_ENCRYPT_8( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7, pGhashSrc, 8, expandedKeyTable, todo, a0, a1, a2 );
@ -1814,7 +1668,7 @@ SymCryptAesGcmEncryptStitchedNeon(
// Encrypt 1-7 blocks with pre-generated AES-CTR blocks and GHASH the results
while( nBlocks >= 2 )
{
ctr0 = vaddq_u64( ctr0, chainIncrement2 );
ctr0 = vaddq_u32( ctr0, chainIncrement2 );
r0 = veorq_u64( pSrc[0], c0 );
r1 = veorq_u64( pSrc[1], c1 );
@ -1845,7 +1699,7 @@ SymCryptAesGcmEncryptStitchedNeon(
if( nBlocks > 0 )
{
ctr0 = vaddq_u64( ctr0, chainIncrement1 );
ctr0 = vaddq_u32( ctr0, chainIncrement1 );
r0 = veorq_u64( pSrc[0], c0 );
pDst[0] = r0;
@ -1911,13 +1765,13 @@ SymCryptAesGcmDecryptStitchedNeon(
// Our chain variable is in integer format, not the MSBfirst format loaded from memory.
ctr0 = vrev64q_u8( chain );
ctr1 = vaddq_u64( ctr0, chainIncrement1 );
ctr2 = vaddq_u64( ctr0, chainIncrement2 );
ctr3 = vaddq_u64( ctr1, chainIncrement2 );
ctr4 = vaddq_u64( ctr2, chainIncrement2 );
ctr5 = vaddq_u64( ctr3, chainIncrement2 );
ctr6 = vaddq_u64( ctr4, chainIncrement2 );
ctr7 = vaddq_u64( ctr5, chainIncrement2 );
ctr1 = vaddq_u32( ctr0, chainIncrement1 );
ctr2 = vaddq_u32( ctr0, chainIncrement2 );
ctr3 = vaddq_u32( ctr1, chainIncrement2 );
ctr4 = vaddq_u32( ctr2, chainIncrement2 );
ctr5 = vaddq_u32( ctr3, chainIncrement2 );
ctr6 = vaddq_u32( ctr4, chainIncrement2 );
ctr7 = vaddq_u32( ctr5, chainIncrement2 );
state = *(__n128 *) pState;
@ -1937,14 +1791,14 @@ SymCryptAesGcmDecryptStitchedNeon(
c6 = vrev64q_u8( ctr6 );
c7 = vrev64q_u8( ctr7 );
ctr0 = vaddq_u64( ctr0, chainIncrement8 );
ctr1 = vaddq_u64( ctr1, chainIncrement8 );
ctr2 = vaddq_u64( ctr2, chainIncrement8 );
ctr3 = vaddq_u64( ctr3, chainIncrement8 );
ctr4 = vaddq_u64( ctr4, chainIncrement8 );
ctr5 = vaddq_u64( ctr5, chainIncrement8 );
ctr6 = vaddq_u64( ctr6, chainIncrement8 );
ctr7 = vaddq_u64( ctr7, chainIncrement8 );
ctr0 = vaddq_u32( ctr0, chainIncrement8 );
ctr1 = vaddq_u32( ctr1, chainIncrement8 );
ctr2 = vaddq_u32( ctr2, chainIncrement8 );
ctr3 = vaddq_u32( ctr3, chainIncrement8 );
ctr4 = vaddq_u32( ctr4, chainIncrement8 );
ctr5 = vaddq_u32( ctr5, chainIncrement8 );
ctr6 = vaddq_u32( ctr6, chainIncrement8 );
ctr7 = vaddq_u32( ctr7, chainIncrement8 );
AES_GCM_ENCRYPT_8( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7, pGhashSrc, 8, expandedKeyTable, todo, a0, a1, a2 );
@ -1999,7 +1853,7 @@ SymCryptAesGcmDecryptStitchedNeon(
// Decrypt 1-7 blocks with pre-generated AES-CTR blocks
while( nBlocks >= 2 )
{
ctr0 = vaddq_u64( ctr0, chainIncrement2 );
ctr0 = vaddq_u32( ctr0, chainIncrement2 );
pDst[0] = veorq_u64( pSrc[0], c0 );
pDst[1] = veorq_u64( pSrc[1], c1 );
@ -2016,7 +1870,7 @@ SymCryptAesGcmDecryptStitchedNeon(
if( nBlocks > 0 )
{
ctr0 = vaddq_u64( ctr0, chainIncrement1 );
ctr0 = vaddq_u32( ctr0, chainIncrement1 );
pDst[0] = veorq_u64( pSrc[0], c0 );
}

lib/aes-pattern.c (new file, 344 lines added)

View file

@ -0,0 +1,344 @@
//
// aes-pattern.c
//
// Copyright (c) Microsoft Corporation. Licensed under the MIT license.
//
// This file contains "pattern" code for AES-related functions. It's not intended to be compiled
// directly; rather it is included by other aes-*.c files which define the macros used here.
//
#if SYMCRYPT_CPU_ARM64
VOID
SYMCRYPT_CALL
SYMCRYPT_AesCtrMsbXxNeon(
_In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
_Inout_updates_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbChainingValue,
_In_reads_( cbData ) PCBYTE pbSrc,
_Out_writes_( cbData ) PBYTE pbDst,
SIZE_T cbData )
{
__n128 chain = *(__n128 *)pbChainingValue;
const __n128 * pSrc = (const __n128 *) pbSrc;
__n128 * pDst = (__n128 *) pbDst;
const __n128 chainIncrement1 = SYMCRYPT_SET_N128_U64( 0, 1 );
const __n128 chainIncrement2 = SYMCRYPT_SET_N128_U64( 0, 2 );
const __n128 chainIncrement8 = SYMCRYPT_SET_N128_U64( 0, 8 );
__n128 ctr0, ctr1, ctr2, ctr3, ctr4, ctr5, ctr6, ctr7;
__n128 c0, c1, c2, c3, c4, c5, c6, c7;
cbData &= ~(SYMCRYPT_AES_BLOCK_SIZE - 1);
// Our chain variable is in integer format, not the MSBfirst format loaded from memory.
ctr0 = vrev64q_u8( chain );
ctr1 = VADDQ_UXX( ctr0, chainIncrement1 );
ctr2 = VADDQ_UXX( ctr0, chainIncrement2 );
ctr3 = VADDQ_UXX( ctr1, chainIncrement2 );
ctr4 = VADDQ_UXX( ctr2, chainIncrement2 );
ctr5 = VADDQ_UXX( ctr3, chainIncrement2 );
ctr6 = VADDQ_UXX( ctr4, chainIncrement2 );
ctr7 = VADDQ_UXX( ctr5, chainIncrement2 );
/*
while cbData >= 5 * block
generate 8 blocks of key stream
if cbData < 8 * block
break;
process 8 blocks
if cbData >= 5 * block
process 5-7 blocks
done
if cbData >= 2 * block
generate 4 blocks of key stream
process 2-4 blocks
done
if cbData == 1 block
generate 1 block of key stream
process block
*/
while( cbData >= 5 * SYMCRYPT_AES_BLOCK_SIZE )
{
c0 = vrev64q_u8( ctr0 );
c1 = vrev64q_u8( ctr1 );
c2 = vrev64q_u8( ctr2 );
c3 = vrev64q_u8( ctr3 );
c4 = vrev64q_u8( ctr4 );
c5 = vrev64q_u8( ctr5 );
c6 = vrev64q_u8( ctr6 );
c7 = vrev64q_u8( ctr7 );
ctr0 = VADDQ_UXX( ctr0, chainIncrement8 );
ctr1 = VADDQ_UXX( ctr1, chainIncrement8 );
ctr2 = VADDQ_UXX( ctr2, chainIncrement8 );
ctr3 = VADDQ_UXX( ctr3, chainIncrement8 );
ctr4 = VADDQ_UXX( ctr4, chainIncrement8 );
ctr5 = VADDQ_UXX( ctr5, chainIncrement8 );
ctr6 = VADDQ_UXX( ctr6, chainIncrement8 );
ctr7 = VADDQ_UXX( ctr7, chainIncrement8 );
AES_ENCRYPT_8( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7 );
if( cbData < 8 * SYMCRYPT_AES_BLOCK_SIZE )
{
break;
}
pDst[0] = veorq_u64( pSrc[0], c0 );
pDst[1] = veorq_u64( pSrc[1], c1 );
pDst[2] = veorq_u64( pSrc[2], c2 );
pDst[3] = veorq_u64( pSrc[3], c3 );
pDst[4] = veorq_u64( pSrc[4], c4 );
pDst[5] = veorq_u64( pSrc[5], c5 );
pDst[6] = veorq_u64( pSrc[6], c6 );
pDst[7] = veorq_u64( pSrc[7], c7 );
pDst += 8;
pSrc += 8;
cbData -= 8 * SYMCRYPT_AES_BLOCK_SIZE;
}
//
// At this point we have one of the two following cases:
// - cbData >= 5 * 16 and we have 8 blocks of key stream in c0-c7. ctr0-ctr7 is set to (c0+8)-(c7+8)
// - cbData < 5 * 16 and we have no blocks of key stream, and ctr0-ctr7 set to the next 8 counters to use
//
if( cbData >= SYMCRYPT_AES_BLOCK_SIZE ) // quick exit of function if the request was a multiple of 8 blocks
{
if( cbData >= 5 * SYMCRYPT_AES_BLOCK_SIZE )
{
//
// We already have the key stream
//
pDst[0] = veorq_u64( pSrc[0], c0 );
pDst[1] = veorq_u64( pSrc[1], c1 );
pDst[2] = veorq_u64( pSrc[2], c2 );
pDst[3] = veorq_u64( pSrc[3], c3 );
pDst[4] = veorq_u64( pSrc[4], c4 );
chain = VSUBQ_UXX( ctr5, chainIncrement8 );
if( cbData >= 96 )
{
chain = VSUBQ_UXX( ctr6, chainIncrement8 );
pDst[5] = veorq_u64( pSrc[5], c5 );
if( cbData >= 112 )
{
chain = VSUBQ_UXX( ctr7, chainIncrement8 );
pDst[6] = veorq_u64( pSrc[6], c6 );
}
}
}
else if( cbData >= 2 * SYMCRYPT_AES_BLOCK_SIZE )
{
// Produce 4 blocks of key stream
chain = ctr2; // chain is only incremented by 2 for now
c0 = vrev64q_u8( ctr0 );
c1 = vrev64q_u8( ctr1 );
c2 = vrev64q_u8( ctr2 );
c3 = vrev64q_u8( ctr3 );
AES_ENCRYPT_4( pExpandedKey, c0, c1, c2, c3 );
pDst[0] = veorq_u64( pSrc[0], c0 );
pDst[1] = veorq_u64( pSrc[1], c1 );
if( cbData >= 48 )
{
chain = ctr3;
pDst[2] = veorq_u64( pSrc[2], c2 );
if( cbData >= 64 )
{
chain = ctr4;
pDst[3] = veorq_u64( pSrc[3], c3 );
}
}
}
else
{
// Exactly 1 block to process
chain = ctr1;
c0 = vrev64q_u8( ctr0 );
AES_ENCRYPT_1( pExpandedKey, c0 );
pDst[0] = veorq_u64( pSrc[0], c0 );
}
}
else
{
chain = ctr0;
}
chain = vrev64q_u8( chain );
*(__n128 *)pbChainingValue = chain;
}
#endif // SYMCRYPT_CPU_ARM64
#if SYMCRYPT_CPU_X86 | SYMCRYPT_CPU_AMD64
VOID
SYMCRYPT_CALL
SYMCRYPT_AesCtrMsbXxXmm(
_In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
_Inout_updates_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbChainingValue,
_In_reads_( cbData ) PCBYTE pbSrc,
_Out_writes_( cbData ) PBYTE pbDst,
SIZE_T cbData )
{
__m128i chain = _mm_loadu_si128( (__m128i *) pbChainingValue );
__m128i BYTE_REVERSE_ORDER = _mm_set_epi8(
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 );
__m128i chainIncrement1 = _mm_set_epi32( 0, 0, 0, 1 );
__m128i chainIncrement2 = _mm_set_epi32( 0, 0, 0, 2 );
__m128i chainIncrement3 = _mm_set_epi32( 0, 0, 0, 3 );
//__m128i chainIncrement8 = _mm_set_epi32( 0, 0, 0, 8 );
__m128i c0, c1, c2, c3, c4, c5, c6, c7;
cbData &= ~(SYMCRYPT_AES_BLOCK_SIZE - 1);
chain = _mm_shuffle_epi8( chain, BYTE_REVERSE_ORDER );
/*
while cbData >= 5 * block
generate 8 blocks of key stream
if cbData < 8 * block
break;
process 8 blocks
if cbData >= 5 * block
process 5-7 blocks
done
if cbData > 1 block
generate 4 blocks of key stream
process 2-4 blocks
done
if cbData == 1 block
generate 1 block of key stream
process block
*/
while( cbData >= 5 * SYMCRYPT_AES_BLOCK_SIZE )
{
c0 = chain;
c1 = MM_ADD_EPIXX( chain, chainIncrement1 );
c2 = MM_ADD_EPIXX( chain, chainIncrement2 );
c3 = MM_ADD_EPIXX( c1, chainIncrement2 );
c4 = MM_ADD_EPIXX( c2, chainIncrement2 );
c5 = MM_ADD_EPIXX( c3, chainIncrement2 );
c6 = MM_ADD_EPIXX( c4, chainIncrement2 );
c7 = MM_ADD_EPIXX( c5, chainIncrement2 );
chain = MM_ADD_EPIXX( c6, chainIncrement2 );
c0 = _mm_shuffle_epi8( c0, BYTE_REVERSE_ORDER );
c1 = _mm_shuffle_epi8( c1, BYTE_REVERSE_ORDER );
c2 = _mm_shuffle_epi8( c2, BYTE_REVERSE_ORDER );
c3 = _mm_shuffle_epi8( c3, BYTE_REVERSE_ORDER );
c4 = _mm_shuffle_epi8( c4, BYTE_REVERSE_ORDER );
c5 = _mm_shuffle_epi8( c5, BYTE_REVERSE_ORDER );
c6 = _mm_shuffle_epi8( c6, BYTE_REVERSE_ORDER );
c7 = _mm_shuffle_epi8( c7, BYTE_REVERSE_ORDER );
AES_ENCRYPT_8( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7 );
if( cbData < 8 * SYMCRYPT_AES_BLOCK_SIZE )
{
break;
}
_mm_storeu_si128( (__m128i *) (pbDst + 0), _mm_xor_si128( c0, _mm_loadu_si128( ( __m128i * ) (pbSrc + 0 ) ) ) );
_mm_storeu_si128( (__m128i *) (pbDst + 16), _mm_xor_si128( c1, _mm_loadu_si128( ( __m128i * ) (pbSrc + 16 ) ) ) );
_mm_storeu_si128( (__m128i *) (pbDst + 32), _mm_xor_si128( c2, _mm_loadu_si128( ( __m128i * ) (pbSrc + 32 ) ) ) );
_mm_storeu_si128( (__m128i *) (pbDst + 48), _mm_xor_si128( c3, _mm_loadu_si128( ( __m128i * ) (pbSrc + 48 ) ) ) );
_mm_storeu_si128( (__m128i *) (pbDst + 64), _mm_xor_si128( c4, _mm_loadu_si128( ( __m128i * ) (pbSrc + 64 ) ) ) );
_mm_storeu_si128( (__m128i *) (pbDst + 80), _mm_xor_si128( c5, _mm_loadu_si128( ( __m128i * ) (pbSrc + 80 ) ) ) );
_mm_storeu_si128( (__m128i *) (pbDst + 96), _mm_xor_si128( c6, _mm_loadu_si128( ( __m128i * ) (pbSrc + 96 ) ) ) );
_mm_storeu_si128( (__m128i *) (pbDst +112), _mm_xor_si128( c7, _mm_loadu_si128( ( __m128i * ) (pbSrc +112 ) ) ) );
pbDst += 8 * SYMCRYPT_AES_BLOCK_SIZE;
pbSrc += 8 * SYMCRYPT_AES_BLOCK_SIZE;
cbData -= 8 * SYMCRYPT_AES_BLOCK_SIZE;
}
//
// At this point we have one of the two following cases:
// - cbData >= 5 * 16 and we have 8 blocks of key stream in c0-c7. chain is set to c7 + 1
// - cbData < 5 * 16 and we have no blocks of key stream, with chain the next value to use
//
if( cbData >= SYMCRYPT_AES_BLOCK_SIZE ) // quick exit of function if the request was a multiple of 8 blocks
{
if( cbData >= 5 * SYMCRYPT_AES_BLOCK_SIZE )
{
//
// We already have the key stream
//
_mm_storeu_si128( (__m128i *) (pbDst + 0), _mm_xor_si128( c0, _mm_loadu_si128( ( __m128i * ) (pbSrc + 0 ) ) ) );
_mm_storeu_si128( (__m128i *) (pbDst + 16), _mm_xor_si128( c1, _mm_loadu_si128( ( __m128i * ) (pbSrc + 16 ) ) ) );
_mm_storeu_si128( (__m128i *) (pbDst + 32), _mm_xor_si128( c2, _mm_loadu_si128( ( __m128i * ) (pbSrc + 32 ) ) ) );
_mm_storeu_si128( (__m128i *) (pbDst + 48), _mm_xor_si128( c3, _mm_loadu_si128( ( __m128i * ) (pbSrc + 48 ) ) ) );
_mm_storeu_si128( (__m128i *) (pbDst + 64), _mm_xor_si128( c4, _mm_loadu_si128( ( __m128i * ) (pbSrc + 64 ) ) ) );
chain = MM_SUB_EPIXX( chain, chainIncrement3 );
if( cbData >= 96 )
{
chain = MM_ADD_EPIXX( chain, chainIncrement1 );
_mm_storeu_si128( (__m128i *) (pbDst + 80), _mm_xor_si128( c5, _mm_loadu_si128( ( __m128i * ) (pbSrc + 80 ) ) ) );
if( cbData >= 112 )
{
chain = MM_ADD_EPIXX( chain, chainIncrement1 );
_mm_storeu_si128( (__m128i *) (pbDst + 96), _mm_xor_si128( c6, _mm_loadu_si128( ( __m128i * ) (pbSrc + 96 ) ) ) );
}
}
}
else if( cbData >= 2 * SYMCRYPT_AES_BLOCK_SIZE )
{
// Produce 4 blocks of key stream
c0 = chain;
c1 = MM_ADD_EPIXX( chain, chainIncrement1 );
c2 = MM_ADD_EPIXX( chain, chainIncrement2 );
c3 = MM_ADD_EPIXX( c1, chainIncrement2 );
chain = c2; // chain is only incremented by 2 for now
c0 = _mm_shuffle_epi8( c0, BYTE_REVERSE_ORDER );
c1 = _mm_shuffle_epi8( c1, BYTE_REVERSE_ORDER );
c2 = _mm_shuffle_epi8( c2, BYTE_REVERSE_ORDER );
c3 = _mm_shuffle_epi8( c3, BYTE_REVERSE_ORDER );
AES_ENCRYPT_4( pExpandedKey, c0, c1, c2, c3 );
_mm_storeu_si128( (__m128i *) (pbDst + 0), _mm_xor_si128( c0, _mm_loadu_si128( ( __m128i * ) (pbSrc + 0 ) ) ) );
_mm_storeu_si128( (__m128i *) (pbDst + 16), _mm_xor_si128( c1, _mm_loadu_si128( ( __m128i * ) (pbSrc + 16 ) ) ) );
if( cbData >= 48 )
{
chain = MM_ADD_EPIXX( chain, chainIncrement1 );
_mm_storeu_si128( (__m128i *) (pbDst + 32), _mm_xor_si128( c2, _mm_loadu_si128( ( __m128i * ) (pbSrc + 32 ) ) ) );
if( cbData >= 64 )
{
chain = MM_ADD_EPIXX( chain, chainIncrement1 );
_mm_storeu_si128( (__m128i *) (pbDst + 48), _mm_xor_si128( c3, _mm_loadu_si128( ( __m128i * ) (pbSrc + 48 ) ) ) );
}
}
}
else
{
// Exactly 1 block to process
c0 = chain;
chain = MM_ADD_EPIXX( chain, chainIncrement1 );
c0 = _mm_shuffle_epi8( c0, BYTE_REVERSE_ORDER );
AES_ENCRYPT_1( pExpandedKey, c0 );
_mm_storeu_si128( (__m128i *) (pbDst + 0), _mm_xor_si128( c0, _mm_loadu_si128( ( __m128i * ) (pbSrc + 0 ) ) ) );
}
}
chain = _mm_shuffle_epi8( chain, BYTE_REVERSE_ORDER );
_mm_storeu_si128( (__m128i *) pbChainingValue, chain );
}
#endif // SYMCRYPT_CPU_X86 | SYMCRYPT_CPU_AMD64

View file

@ -727,166 +727,26 @@ SymCryptAesCbcMacXmm(
#pragma warning( disable:4701 ) // "Use of uninitialized variable"
#pragma runtime_checks( "u", off )
VOID
SYMCRYPT_CALL
SymCryptAesCtrMsb64Xmm(
_In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
_Inout_updates_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbChainingValue,
_In_reads_( cbData ) PCBYTE pbSrc,
_Out_writes_( cbData ) PBYTE pbDst,
SIZE_T cbData )
{
__m128i chain = _mm_loadu_si128( (__m128i *) pbChainingValue );
#define SYMCRYPT_AesCtrMsbXxXmm SymCryptAesCtrMsb64Xmm
#define MM_ADD_EPIXX _mm_add_epi64
#define MM_SUB_EPIXX _mm_sub_epi64
__m128i BYTE_REVERSE_ORDER = _mm_set_epi8(
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 );
#include "aes-pattern.c"
__m128i chainIncrement1 = _mm_set_epi32( 0, 0, 0, 1 );
__m128i chainIncrement2 = _mm_set_epi32( 0, 0, 0, 2 );
__m128i chainIncrement3 = _mm_set_epi32( 0, 0, 0, 3 );
//__m128i chainIncrement8 = _mm_set_epi32( 0, 0, 0, 8 );
#undef MM_SUB_EPIXX
#undef MM_ADD_EPIXX
#undef SYMCRYPT_AesCtrMsbXxXmm
__m128i c0, c1, c2, c3, c4, c5, c6, c7;
#define SYMCRYPT_AesCtrMsbXxXmm SymCryptAesCtrMsb32Xmm
#define MM_ADD_EPIXX _mm_add_epi32
#define MM_SUB_EPIXX _mm_sub_epi32
cbData &= ~(SYMCRYPT_AES_BLOCK_SIZE - 1);
#include "aes-pattern.c"
chain = _mm_shuffle_epi8( chain, BYTE_REVERSE_ORDER );
#undef MM_SUB_EPIXX
#undef MM_ADD_EPIXX
#undef SYMCRYPT_AesCtrMsbXxXmm
/*
while cbData >= 5 * block
generate 8 blocks of key stream
if cbData < 8 * block
break;
process 8 blocks
if cbData >= 5 * block
process 5-7 blocks
done
if cbData > 1 block
generate 4 blocks of key stream
process 2-4 blocks
done
if cbData == 1 block
generate 1 block of key stream
process block
*/
while( cbData >= 5 * SYMCRYPT_AES_BLOCK_SIZE )
{
c0 = chain;
c1 = _mm_add_epi64( chain, chainIncrement1 );
c2 = _mm_add_epi64( chain, chainIncrement2 );
c3 = _mm_add_epi64( c1, chainIncrement2 );
c4 = _mm_add_epi64( c2, chainIncrement2 );
c5 = _mm_add_epi64( c3, chainIncrement2 );
c6 = _mm_add_epi64( c4, chainIncrement2 );
c7 = _mm_add_epi64( c5, chainIncrement2 );
chain = _mm_add_epi64( c6, chainIncrement2 );
c0 = _mm_shuffle_epi8( c0, BYTE_REVERSE_ORDER );
c1 = _mm_shuffle_epi8( c1, BYTE_REVERSE_ORDER );
c2 = _mm_shuffle_epi8( c2, BYTE_REVERSE_ORDER );
c3 = _mm_shuffle_epi8( c3, BYTE_REVERSE_ORDER );
c4 = _mm_shuffle_epi8( c4, BYTE_REVERSE_ORDER );
c5 = _mm_shuffle_epi8( c5, BYTE_REVERSE_ORDER );
c6 = _mm_shuffle_epi8( c6, BYTE_REVERSE_ORDER );
c7 = _mm_shuffle_epi8( c7, BYTE_REVERSE_ORDER );
AES_ENCRYPT_8( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7 );
if( cbData < 8 * SYMCRYPT_AES_BLOCK_SIZE )
{
break;
}
_mm_storeu_si128( (__m128i *) (pbDst + 0), _mm_xor_si128( c0, _mm_loadu_si128( ( __m128i * ) (pbSrc + 0 ) ) ) );
_mm_storeu_si128( (__m128i *) (pbDst + 16), _mm_xor_si128( c1, _mm_loadu_si128( ( __m128i * ) (pbSrc + 16 ) ) ) );
_mm_storeu_si128( (__m128i *) (pbDst + 32), _mm_xor_si128( c2, _mm_loadu_si128( ( __m128i * ) (pbSrc + 32 ) ) ) );
_mm_storeu_si128( (__m128i *) (pbDst + 48), _mm_xor_si128( c3, _mm_loadu_si128( ( __m128i * ) (pbSrc + 48 ) ) ) );
_mm_storeu_si128( (__m128i *) (pbDst + 64), _mm_xor_si128( c4, _mm_loadu_si128( ( __m128i * ) (pbSrc + 64 ) ) ) );
_mm_storeu_si128( (__m128i *) (pbDst + 80), _mm_xor_si128( c5, _mm_loadu_si128( ( __m128i * ) (pbSrc + 80 ) ) ) );
_mm_storeu_si128( (__m128i *) (pbDst + 96), _mm_xor_si128( c6, _mm_loadu_si128( ( __m128i * ) (pbSrc + 96 ) ) ) );
_mm_storeu_si128( (__m128i *) (pbDst +112), _mm_xor_si128( c7, _mm_loadu_si128( ( __m128i * ) (pbSrc +112 ) ) ) );
pbDst += 8 * SYMCRYPT_AES_BLOCK_SIZE;
pbSrc += 8 * SYMCRYPT_AES_BLOCK_SIZE;
cbData -= 8 * SYMCRYPT_AES_BLOCK_SIZE;
}
//
// At this point we have one of the two following cases:
// - cbData >= 5 * 16 and we have 8 blocks of key stream in c0-c7. chain is set to c7 + 1
// - cbData < 5 * 16 and we have no blocks of key stream, with chain the next value to use
//
if( cbData >= SYMCRYPT_AES_BLOCK_SIZE ) // quick exit of function if the request was a multiple of 8 blocks
{
if( cbData >= 5 * SYMCRYPT_AES_BLOCK_SIZE )
{
//
// We already have the key stream
//
_mm_storeu_si128( (__m128i *) (pbDst + 0), _mm_xor_si128( c0, _mm_loadu_si128( ( __m128i * ) (pbSrc + 0 ) ) ) );
_mm_storeu_si128( (__m128i *) (pbDst + 16), _mm_xor_si128( c1, _mm_loadu_si128( ( __m128i * ) (pbSrc + 16 ) ) ) );
_mm_storeu_si128( (__m128i *) (pbDst + 32), _mm_xor_si128( c2, _mm_loadu_si128( ( __m128i * ) (pbSrc + 32 ) ) ) );
_mm_storeu_si128( (__m128i *) (pbDst + 48), _mm_xor_si128( c3, _mm_loadu_si128( ( __m128i * ) (pbSrc + 48 ) ) ) );
_mm_storeu_si128( (__m128i *) (pbDst + 64), _mm_xor_si128( c4, _mm_loadu_si128( ( __m128i * ) (pbSrc + 64 ) ) ) );
chain = _mm_sub_epi64( chain, chainIncrement3 );
if( cbData >= 96 )
{
chain = _mm_add_epi64( chain, chainIncrement1 );
_mm_storeu_si128( (__m128i *) (pbDst + 80), _mm_xor_si128( c5, _mm_loadu_si128( ( __m128i * ) (pbSrc + 80 ) ) ) );
if( cbData >= 112 )
{
chain = _mm_add_epi64( chain, chainIncrement1 );
_mm_storeu_si128( (__m128i *) (pbDst + 96), _mm_xor_si128( c6, _mm_loadu_si128( ( __m128i * ) (pbSrc + 96 ) ) ) );
}
}
}
else if( cbData >= 2 * SYMCRYPT_AES_BLOCK_SIZE )
{
// Produce 4 blocks of key stream
c0 = chain;
c1 = _mm_add_epi64( chain, chainIncrement1 );
c2 = _mm_add_epi64( chain, chainIncrement2 );
c3 = _mm_add_epi64( c1, chainIncrement2 );
chain = c2; // chain is only incremented by 2 for now
c0 = _mm_shuffle_epi8( c0, BYTE_REVERSE_ORDER );
c1 = _mm_shuffle_epi8( c1, BYTE_REVERSE_ORDER );
c2 = _mm_shuffle_epi8( c2, BYTE_REVERSE_ORDER );
c3 = _mm_shuffle_epi8( c3, BYTE_REVERSE_ORDER );
AES_ENCRYPT_4( pExpandedKey, c0, c1, c2, c3 );
_mm_storeu_si128( (__m128i *) (pbDst + 0), _mm_xor_si128( c0, _mm_loadu_si128( ( __m128i * ) (pbSrc + 0 ) ) ) );
_mm_storeu_si128( (__m128i *) (pbDst + 16), _mm_xor_si128( c1, _mm_loadu_si128( ( __m128i * ) (pbSrc + 16 ) ) ) );
if( cbData >= 48 )
{
chain = _mm_add_epi64( chain, chainIncrement1 );
_mm_storeu_si128( (__m128i *) (pbDst + 32), _mm_xor_si128( c2, _mm_loadu_si128( ( __m128i * ) (pbSrc + 32 ) ) ) );
if( cbData >= 64 )
{
chain = _mm_add_epi64( chain, chainIncrement1 );
_mm_storeu_si128( (__m128i *) (pbDst + 48), _mm_xor_si128( c3, _mm_loadu_si128( ( __m128i * ) (pbSrc + 48 ) ) ) );
}
}
}
else
{
// Exactly 1 block to process
c0 = chain;
chain = _mm_add_epi64( chain, chainIncrement1 );
c0 = _mm_shuffle_epi8( c0, BYTE_REVERSE_ORDER );
AES_ENCRYPT_1( pExpandedKey, c0 );
_mm_storeu_si128( (__m128i *) (pbDst + 0), _mm_xor_si128( c0, _mm_loadu_si128( ( __m128i * ) (pbSrc + 0 ) ) ) );
}
}
chain = _mm_shuffle_epi8( chain, BYTE_REVERSE_ORDER );
_mm_storeu_si128( (__m128i *) pbChainingValue, chain );
}
#pragma runtime_checks( "u", restore )
#pragma warning(pop)
@ -1558,13 +1418,13 @@ SymCryptAesGcmEncryptStitchedXmm(
// Do 8 blocks of CTR either for tail (if total blocks <8) or for encryption of first 8 blocks
c0 = chain;
c1 = _mm_add_epi64( chain, chainIncrement1 );
c2 = _mm_add_epi64( chain, chainIncrement2 );
c3 = _mm_add_epi64( c1, chainIncrement2 );
c4 = _mm_add_epi64( c2, chainIncrement2 );
c5 = _mm_add_epi64( c3, chainIncrement2 );
c6 = _mm_add_epi64( c4, chainIncrement2 );
c7 = _mm_add_epi64( c5, chainIncrement2 );
c1 = _mm_add_epi32( chain, chainIncrement1 );
c2 = _mm_add_epi32( chain, chainIncrement2 );
c3 = _mm_add_epi32( c1, chainIncrement2 );
c4 = _mm_add_epi32( c2, chainIncrement2 );
c5 = _mm_add_epi32( c3, chainIncrement2 );
c6 = _mm_add_epi32( c4, chainIncrement2 );
c7 = _mm_add_epi32( c5, chainIncrement2 );
c0 = _mm_shuffle_epi8( c0, BYTE_REVERSE_ORDER );
c1 = _mm_shuffle_epi8( c1, BYTE_REVERSE_ORDER );
@ -1580,7 +1440,7 @@ SymCryptAesGcmEncryptStitchedXmm(
if( nBlocks >= 8 )
{
// Encrypt first 8 blocks - update chain
chain = _mm_add_epi64( chain, chainIncrement8 );
chain = _mm_add_epi32( chain, chainIncrement8 );
_mm_storeu_si128( (__m128i *) (pbDst + 0), _mm_xor_si128( c0, _mm_loadu_si128( ( __m128i * ) (pbSrc + 0) ) ) );
_mm_storeu_si128( (__m128i *) (pbDst + 16), _mm_xor_si128( c1, _mm_loadu_si128( ( __m128i * ) (pbSrc + 16) ) ) );
@ -1598,14 +1458,14 @@ SymCryptAesGcmEncryptStitchedXmm(
{
// In this loop we always have 8 blocks to encrypt and we have already encrypted the previous 8 blocks ready for GHASH
c0 = chain;
c1 = _mm_add_epi64( chain, chainIncrement1 );
c2 = _mm_add_epi64( chain, chainIncrement2 );
c3 = _mm_add_epi64( c1, chainIncrement2 );
c4 = _mm_add_epi64( c2, chainIncrement2 );
c5 = _mm_add_epi64( c3, chainIncrement2 );
c6 = _mm_add_epi64( c4, chainIncrement2 );
c7 = _mm_add_epi64( c5, chainIncrement2 );
chain = _mm_add_epi64( c6, chainIncrement2 );
c1 = _mm_add_epi32( chain, chainIncrement1 );
c2 = _mm_add_epi32( chain, chainIncrement2 );
c3 = _mm_add_epi32( c1, chainIncrement2 );
c4 = _mm_add_epi32( c2, chainIncrement2 );
c5 = _mm_add_epi32( c3, chainIncrement2 );
c6 = _mm_add_epi32( c4, chainIncrement2 );
c7 = _mm_add_epi32( c5, chainIncrement2 );
chain = _mm_add_epi32( c6, chainIncrement2 );
c0 = _mm_shuffle_epi8( c0, BYTE_REVERSE_ORDER );
c1 = _mm_shuffle_epi8( c1, BYTE_REVERSE_ORDER );
@ -1647,10 +1507,10 @@ SymCryptAesGcmEncryptStitchedXmm(
if (nBlocks > 0)
{
c0 = chain;
c1 = _mm_add_epi64( chain, chainIncrement1 );
c2 = _mm_add_epi64( chain, chainIncrement2 );
c3 = _mm_add_epi64( c1, chainIncrement2 );
c4 = _mm_add_epi64( c2, chainIncrement2 );
c1 = _mm_add_epi32( chain, chainIncrement1 );
c2 = _mm_add_epi32( chain, chainIncrement2 );
c3 = _mm_add_epi32( c1, chainIncrement2 );
c4 = _mm_add_epi32( c2, chainIncrement2 );
c0 = _mm_shuffle_epi8( c0, BYTE_REVERSE_ORDER );
c1 = _mm_shuffle_epi8( c1, BYTE_REVERSE_ORDER );
@ -1660,8 +1520,8 @@ SymCryptAesGcmEncryptStitchedXmm(
if (nBlocks > 4)
{
// Do 8 rounds of AES-CTR for tail in parallel with 8 rounds of GHASH
c5 = _mm_add_epi64( c4, chainIncrement1 );
c6 = _mm_add_epi64( c4, chainIncrement2 );
c5 = _mm_add_epi32( c4, chainIncrement1 );
c6 = _mm_add_epi32( c4, chainIncrement2 );
c4 = _mm_shuffle_epi8( c4, BYTE_REVERSE_ORDER );
c5 = _mm_shuffle_epi8( c5, BYTE_REVERSE_ORDER );
@ -1705,7 +1565,7 @@ SymCryptAesGcmEncryptStitchedXmm(
// Encrypt 1-7 blocks with pre-generated AES-CTR blocks and GHASH the results
while( nBlocks >= 2 )
{
chain = _mm_add_epi64( chain, chainIncrement2 );
chain = _mm_add_epi32( chain, chainIncrement2 );
r0 = _mm_xor_si128( c0, _mm_loadu_si128( ( __m128i * ) (pbSrc + 0) ) );
r1 = _mm_xor_si128( c1, _mm_loadu_si128( ( __m128i * ) (pbSrc + 16) ) );
@ -1732,7 +1592,7 @@ SymCryptAesGcmEncryptStitchedXmm(
if( nBlocks > 0 )
{
chain = _mm_add_epi64( chain, chainIncrement1 );
chain = _mm_add_epi32( chain, chainIncrement1 );
r0 = _mm_xor_si128( c0, _mm_loadu_si128( ( __m128i * ) (pbSrc + 0) ) );
@ -1805,14 +1665,14 @@ SymCryptAesGcmDecryptStitchedXmm(
{
// In this loop we always have 8 blocks to decrypt and GHASH
c0 = chain;
c1 = _mm_add_epi64( chain, chainIncrement1 );
c2 = _mm_add_epi64( chain, chainIncrement2 );
c3 = _mm_add_epi64( c1, chainIncrement2 );
c4 = _mm_add_epi64( c2, chainIncrement2 );
c5 = _mm_add_epi64( c3, chainIncrement2 );
c6 = _mm_add_epi64( c4, chainIncrement2 );
c7 = _mm_add_epi64( c5, chainIncrement2 );
chain = _mm_add_epi64( c6, chainIncrement2 );
c1 = _mm_add_epi32( chain, chainIncrement1 );
c2 = _mm_add_epi32( chain, chainIncrement2 );
c3 = _mm_add_epi32( c1, chainIncrement2 );
c4 = _mm_add_epi32( c2, chainIncrement2 );
c5 = _mm_add_epi32( c3, chainIncrement2 );
c6 = _mm_add_epi32( c4, chainIncrement2 );
c7 = _mm_add_epi32( c5, chainIncrement2 );
chain = _mm_add_epi32( c6, chainIncrement2 );
c0 = _mm_shuffle_epi8( c0, BYTE_REVERSE_ORDER );
c1 = _mm_shuffle_epi8( c1, BYTE_REVERSE_ORDER );
@ -1856,10 +1716,10 @@ SymCryptAesGcmDecryptStitchedXmm(
// We have 1-7 blocks to GHASH and decrypt
// Do the exact number of GHASH blocks we need in parallel with generating either 4 or 8 blocks of AES-CTR
c0 = chain;
c1 = _mm_add_epi64( chain, chainIncrement1 );
c2 = _mm_add_epi64( chain, chainIncrement2 );
c3 = _mm_add_epi64( c1, chainIncrement2 );
c4 = _mm_add_epi64( c2, chainIncrement2 );
c1 = _mm_add_epi32( chain, chainIncrement1 );
c2 = _mm_add_epi32( chain, chainIncrement2 );
c3 = _mm_add_epi32( c1, chainIncrement2 );
c4 = _mm_add_epi32( c2, chainIncrement2 );
c0 = _mm_shuffle_epi8( c0, BYTE_REVERSE_ORDER );
c1 = _mm_shuffle_epi8( c1, BYTE_REVERSE_ORDER );
@ -1868,8 +1728,8 @@ SymCryptAesGcmDecryptStitchedXmm(
if( nBlocks > 4 )
{
c5 = _mm_add_epi64( c4, chainIncrement1 );
c6 = _mm_add_epi64( c4, chainIncrement2 );
c5 = _mm_add_epi32( c4, chainIncrement1 );
c6 = _mm_add_epi32( c4, chainIncrement2 );
c4 = _mm_shuffle_epi8( c4, BYTE_REVERSE_ORDER );
c5 = _mm_shuffle_epi8( c5, BYTE_REVERSE_ORDER );
@ -1886,7 +1746,7 @@ SymCryptAesGcmDecryptStitchedXmm(
// Decrypt 1-7 blocks with pre-generated AES-CTR blocks
while( nBlocks >= 2 )
{
chain = _mm_add_epi64( chain, chainIncrement2 );
chain = _mm_add_epi32( chain, chainIncrement2 );
_mm_storeu_si128( (__m128i *) (pbDst + 0), _mm_xor_si128( c0, _mm_loadu_si128( ( __m128i * ) (pbSrc + 0) ) ) );
_mm_storeu_si128( (__m128i *) (pbDst + 16), _mm_xor_si128( c1, _mm_loadu_si128( ( __m128i * ) (pbSrc + 16) ) ) );
@ -1903,7 +1763,7 @@ SymCryptAesGcmDecryptStitchedXmm(
if( nBlocks > 0 )
{
chain = _mm_add_epi64( chain, chainIncrement1 );
chain = _mm_add_epi32( chain, chainIncrement1 );
_mm_storeu_si128( (__m128i *) (pbDst + 0), _mm_xor_si128( c0, _mm_loadu_si128( ( __m128i * ) (pbSrc + 0) ) ) );
}

View file

@ -497,14 +497,14 @@ SymCryptAesGcmEncryptStitchedYmm_2048(
state = _mm_loadu_si128( (__m128i *) pState );
ctr0 = _mm256_insertf128_si256( _mm256_castsi128_si256( chain ), chain, 1); // AVX
ctr0 = _mm256_add_epi64( ctr0, chainIncrementUpper1 );
ctr1 = _mm256_add_epi64( ctr0, chainIncrement2 );
ctr2 = _mm256_add_epi64( ctr0, chainIncrement4 );
ctr3 = _mm256_add_epi64( ctr1, chainIncrement4 );
ctr4 = _mm256_add_epi64( ctr2, chainIncrement4 );
ctr5 = _mm256_add_epi64( ctr3, chainIncrement4 );
ctr6 = _mm256_add_epi64( ctr4, chainIncrement4 );
ctr7 = _mm256_add_epi64( ctr5, chainIncrement4 );
ctr0 = _mm256_add_epi32( ctr0, chainIncrementUpper1 );
ctr1 = _mm256_add_epi32( ctr0, chainIncrement2 );
ctr2 = _mm256_add_epi32( ctr0, chainIncrement4 );
ctr3 = _mm256_add_epi32( ctr1, chainIncrement4 );
ctr4 = _mm256_add_epi32( ctr2, chainIncrement4 );
ctr5 = _mm256_add_epi32( ctr3, chainIncrement4 );
ctr6 = _mm256_add_epi32( ctr4, chainIncrement4 );
ctr7 = _mm256_add_epi32( ctr5, chainIncrement4 );
CLMUL_3( state, GHASH_H_POWER(expandedKeyTable, todo), GHASH_Hx_POWER(expandedKeyTable, todo), a0_xmm, a1_xmm, a2_xmm );
a0 = a1 = a2 = _mm256_setzero_si256();
@ -518,14 +518,14 @@ SymCryptAesGcmEncryptStitchedYmm_2048(
c6 = _mm256_shuffle_epi8( ctr6, BYTE_REVERSE_ORDER );
c7 = _mm256_shuffle_epi8( ctr7, BYTE_REVERSE_ORDER );
ctr0 = _mm256_add_epi64( ctr0, chainIncrement16 );
ctr1 = _mm256_add_epi64( ctr1, chainIncrement16 );
ctr2 = _mm256_add_epi64( ctr2, chainIncrement16 );
ctr3 = _mm256_add_epi64( ctr3, chainIncrement16 );
ctr4 = _mm256_add_epi64( ctr4, chainIncrement16 );
ctr5 = _mm256_add_epi64( ctr5, chainIncrement16 );
ctr6 = _mm256_add_epi64( ctr6, chainIncrement16 );
ctr7 = _mm256_add_epi64( ctr7, chainIncrement16 );
ctr0 = _mm256_add_epi32( ctr0, chainIncrement16 );
ctr1 = _mm256_add_epi32( ctr1, chainIncrement16 );
ctr2 = _mm256_add_epi32( ctr2, chainIncrement16 );
ctr3 = _mm256_add_epi32( ctr3, chainIncrement16 );
ctr4 = _mm256_add_epi32( ctr4, chainIncrement16 );
ctr5 = _mm256_add_epi32( ctr5, chainIncrement16 );
ctr6 = _mm256_add_epi32( ctr6, chainIncrement16 );
ctr7 = _mm256_add_epi32( ctr7, chainIncrement16 );
AES_ENCRYPT_YMM_2048( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7 );
@ -552,14 +552,14 @@ SymCryptAesGcmEncryptStitchedYmm_2048(
c6 = _mm256_shuffle_epi8( ctr6, BYTE_REVERSE_ORDER );
c7 = _mm256_shuffle_epi8( ctr7, BYTE_REVERSE_ORDER );
ctr0 = _mm256_add_epi64( ctr0, chainIncrement16 );
ctr1 = _mm256_add_epi64( ctr1, chainIncrement16 );
ctr2 = _mm256_add_epi64( ctr2, chainIncrement16 );
ctr3 = _mm256_add_epi64( ctr3, chainIncrement16 );
ctr4 = _mm256_add_epi64( ctr4, chainIncrement16 );
ctr5 = _mm256_add_epi64( ctr5, chainIncrement16 );
ctr6 = _mm256_add_epi64( ctr6, chainIncrement16 );
ctr7 = _mm256_add_epi64( ctr7, chainIncrement16 );
ctr0 = _mm256_add_epi32( ctr0, chainIncrement16 );
ctr1 = _mm256_add_epi32( ctr1, chainIncrement16 );
ctr2 = _mm256_add_epi32( ctr2, chainIncrement16 );
ctr3 = _mm256_add_epi32( ctr3, chainIncrement16 );
ctr4 = _mm256_add_epi32( ctr4, chainIncrement16 );
ctr5 = _mm256_add_epi32( ctr5, chainIncrement16 );
ctr6 = _mm256_add_epi32( ctr6, chainIncrement16 );
ctr7 = _mm256_add_epi32( ctr7, chainIncrement16 );
AES_GCM_ENCRYPT_16_Ymm( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7, pbGhashSrc, BYTE_REVERSE_ORDER, expandedKeyTable, todo, a0, a1, a2 );
@ -694,14 +694,14 @@ SymCryptAesGcmDecryptStitchedYmm_2048(
state = _mm_loadu_si128( (__m128i *) pState );
ctr0 = _mm256_insertf128_si256( _mm256_castsi128_si256( chain ), chain, 1); // AVX
ctr0 = _mm256_add_epi64( ctr0, chainIncrementUpper1 );
ctr1 = _mm256_add_epi64( ctr0, chainIncrement2 );
ctr2 = _mm256_add_epi64( ctr0, chainIncrement4 );
ctr3 = _mm256_add_epi64( ctr1, chainIncrement4 );
ctr4 = _mm256_add_epi64( ctr2, chainIncrement4 );
ctr5 = _mm256_add_epi64( ctr3, chainIncrement4 );
ctr6 = _mm256_add_epi64( ctr4, chainIncrement4 );
ctr7 = _mm256_add_epi64( ctr5, chainIncrement4 );
ctr0 = _mm256_add_epi32( ctr0, chainIncrementUpper1 );
ctr1 = _mm256_add_epi32( ctr0, chainIncrement2 );
ctr2 = _mm256_add_epi32( ctr0, chainIncrement4 );
ctr3 = _mm256_add_epi32( ctr1, chainIncrement4 );
ctr4 = _mm256_add_epi32( ctr2, chainIncrement4 );
ctr5 = _mm256_add_epi32( ctr3, chainIncrement4 );
ctr6 = _mm256_add_epi32( ctr4, chainIncrement4 );
ctr7 = _mm256_add_epi32( ctr5, chainIncrement4 );
CLMUL_3( state, GHASH_H_POWER(expandedKeyTable, todo), GHASH_Hx_POWER(expandedKeyTable, todo), a0_xmm, a1_xmm, a2_xmm );
a0 = a1 = a2 = _mm256_setzero_si256();
@ -717,14 +717,14 @@ SymCryptAesGcmDecryptStitchedYmm_2048(
c6 = _mm256_shuffle_epi8( ctr6, BYTE_REVERSE_ORDER );
c7 = _mm256_shuffle_epi8( ctr7, BYTE_REVERSE_ORDER );
ctr0 = _mm256_add_epi64( ctr0, chainIncrement16 );
ctr1 = _mm256_add_epi64( ctr1, chainIncrement16 );
ctr2 = _mm256_add_epi64( ctr2, chainIncrement16 );
ctr3 = _mm256_add_epi64( ctr3, chainIncrement16 );
ctr4 = _mm256_add_epi64( ctr4, chainIncrement16 );
ctr5 = _mm256_add_epi64( ctr5, chainIncrement16 );
ctr6 = _mm256_add_epi64( ctr6, chainIncrement16 );
ctr7 = _mm256_add_epi64( ctr7, chainIncrement16 );
ctr0 = _mm256_add_epi32( ctr0, chainIncrement16 );
ctr1 = _mm256_add_epi32( ctr1, chainIncrement16 );
ctr2 = _mm256_add_epi32( ctr2, chainIncrement16 );
ctr3 = _mm256_add_epi32( ctr3, chainIncrement16 );
ctr4 = _mm256_add_epi32( ctr4, chainIncrement16 );
ctr5 = _mm256_add_epi32( ctr5, chainIncrement16 );
ctr6 = _mm256_add_epi32( ctr6, chainIncrement16 );
ctr7 = _mm256_add_epi32( ctr7, chainIncrement16 );
AES_GCM_ENCRYPT_16_Ymm( pExpandedKey, c0, c1, c2, c3, c4, c5, c6, c7, pbGhashSrc, BYTE_REVERSE_ORDER, expandedKeyTable, todo, a0, a1, a2 );

View file

@ -244,6 +244,57 @@ SymCryptCbcMac(
SymCryptWipeKnownSize( buf, sizeof( buf ));
}
VOID
SYMCRYPT_CALL
SymCryptCtrMsb32(
_In_ PCSYMCRYPT_BLOCKCIPHER pBlockCipher,
_In_ PCVOID pExpandedKey,
_Inout_updates_( pBlockCipher->blockSize )
PBYTE pbChainingValue,
_In_reads_( cbData ) PCBYTE pbSrc,
_Out_writes_( cbData ) PBYTE pbDst,
SIZE_T cbData )
{
SYMCRYPT_ALIGN BYTE buf[2 * SYMCRYPT_MAX_BLOCK_SIZE];
PBYTE count = &buf[0];
PBYTE keystream= &buf[SYMCRYPT_MAX_BLOCK_SIZE];
SIZE_T blockSize;
PCBYTE pbSrcEnd;
blockSize = pBlockCipher->blockSize;
SYMCRYPT_ASSERT( blockSize <= SYMCRYPT_MAX_BLOCK_SIZE );
//
// Compute the end of the data, rounding the size down to a multiple of the block size.
//
pbSrcEnd = &pbSrc[ cbData & ~(blockSize - 1) ];
//
// We keep the chaining state in a local buffer to enforce the read-once write-once rule.
// It also improves memory locality.
//
#pragma warning(suppress: 22105)
memcpy( count, pbChainingValue, blockSize );
while( pbSrc < pbSrcEnd )
{
SYMCRYPT_ASSERT( pbSrc <= pbSrcEnd - blockSize ); // help PreFast
(*pBlockCipher->encryptFunc)( pExpandedKey, count, keystream );
SymCryptXorBytes( keystream, pbSrc, pbDst, blockSize );
//
// We only need to increment the last 32 bits of the counter value.
//
SYMCRYPT_STORE_MSBFIRST32( &count[ blockSize-4 ], 1 + SYMCRYPT_LOAD_MSBFIRST32( &count[ blockSize-4 ] ) );
pbSrc += blockSize;
pbDst += blockSize;
}
memcpy( pbChainingValue, count, blockSize );
SymCryptWipeKnownSize( buf, sizeof( buf ));
}
VOID
SYMCRYPT_CALL
SymCryptCtrMsb64(

lib/gcm.c (152 lines changed)

View file

@ -6,7 +6,7 @@
#include "precomp.h"
#define GCM_REQUIRED_NONCE_SIZE (12)
#define GCM_MIN_NONCE_SIZE (1)
#define GCM_MIN_TAG_SIZE (12)
#define GCM_MAX_TAG_SIZE (16)
@ -26,9 +26,10 @@ SymCryptGcmValidateParameters(
}
//
// We only support 12-byte nonces, as per SP800-38D recommendations.
// SP800-38D specifies that the nonce must be at least one bit, but we operate on bytes,
// so the minimum is one byte.
//
if( cbNonce != GCM_REQUIRED_NONCE_SIZE )
if( cbNonce < GCM_MIN_NONCE_SIZE )
{
return SYMCRYPT_WRONG_NONCE_SIZE;
}
@ -161,21 +162,8 @@ SymCryptGcmEncryptDecryptPart(
{
bytesToProcess = cbData & SYMCRYPT_GCM_BLOCK_ROUND_MASK;
//
// We use the CTR mode function that increments 64 bits, rather than the 32 bits that GCM requires.
// As we only support 12-byte nonces, the 32-bit counter never overflows, and we can safely use
// the 64-bit incrementing primitive.
// If we ever support other nonce sizes this is going to be a big problem.
// You can't fake a 32-bit counter using a 64-bit counter function without side-channels that expose
// information about the current counter value.
// With other nonce sizes the actual counter value itself is not public, so we can't expose that.
// We can do two things:
// - create SymCryptCtrMsb32
// - Accept that we leak information about the counter value; after all it is not treated as a
// secret when the nonce is 12 bytes.
//
SYMCRYPT_ASSERT( pState->pKey->pBlockCipher->blockSize == SYMCRYPT_GCM_BLOCK_SIZE );
SymCryptCtrMsb64( pState->pKey->pBlockCipher,
SymCryptCtrMsb32( pState->pKey->pBlockCipher,
&pState->pKey->blockcipherKey,
&pState->counterBlock[0],
pbSrc,
@ -192,7 +180,7 @@ SymCryptGcmEncryptDecryptPart(
SymCryptWipeKnownSize( &pState->keystreamBlock[0], SYMCRYPT_GCM_BLOCK_SIZE );
SYMCRYPT_ASSERT( pState->pKey->pBlockCipher->blockSize == SYMCRYPT_GCM_BLOCK_SIZE );
SymCryptCtrMsb64( pState->pKey->pBlockCipher,
SymCryptCtrMsb32( pState->pKey->pBlockCipher,
&pState->pKey->blockcipherKey,
&pState->counterBlock[0],
&pState->keystreamBlock[0],
@ -209,7 +197,25 @@ SymCryptGcmEncryptDecryptPart(
}
FORCEINLINE
VOID
SYMCRYPT_CALL
SymCryptGcmResetCounterBlock(
_Inout_ PSYMCRYPT_GCM_STATE pState )
{
// Computing the tag for GCM requires invoking the GCTR function with the pre-counter
// block which was computed when the nonce was set. Historically, we only supported 12-byte
// nonces, so we could trivially reset the counter block by just setting the last 4 bytes to
// (DWORD) 1. With support for larger IVs, the pre-counter block is computed from a GHash of
// the nonce, and we don't store the value. Adding a field in the GCM struct to store the value
// would be ABI-breaking, so instead we can recompute the value by decrementing the last 32 bits
// of the counter block by the number of blocks that have been processed (since the counter is
// incremented once per block), plus one for the initial increment.
UINT32 preCounter32 = SYMCRYPT_LOAD_MSBFIRST32(&pState->counterBlock[12]) -
(UINT32) ((pState->cbData + SYMCRYPT_GCM_BLOCK_SIZE - 1) / SYMCRYPT_GCM_BLOCK_SIZE) - 1;
SYMCRYPT_STORE_MSBFIRST32(&pState->counterBlock[12], preCounter32);
}
VOID
SYMCRYPT_CALL
@ -218,7 +224,6 @@ SymCryptGcmComputeTag(
_Out_writes_( SYMCRYPT_GCM_BLOCK_SIZE ) PBYTE pbTag )
{
SYMCRYPT_ALIGN BYTE buf[2 * SYMCRYPT_GCM_BLOCK_SIZE];
UINT64 cntLow;
SYMCRYPT_STORE_MSBFIRST64( &buf[16], pState->cbAuthData * 8 );
SYMCRYPT_STORE_MSBFIRST64( &buf[24], pState->cbData * 8 );
@ -238,19 +243,7 @@ SymCryptGcmComputeTag(
SymCryptGHashAppendData( &pState->pKey->ghashKey, &pState->ghashState, &buf[16], SYMCRYPT_GCM_BLOCK_SIZE );
}
//
// Set up the correct counter block value
// This is a bit tricky. Normally all we have to do is set the last
// 4 bytes to 00000001. But if the message is 2^36-32 bytes then
// our use of a 64-bit incrementing CTR function has incremented bytes
// 8-11 of the nonce. (The 4-byte counter has overflowed, and
// the carry went into the next byte(s).)
// We resolve this by decrementing the 8-byte value first,
// and then setting the proper bits.
//
cntLow = SYMCRYPT_LOAD_MSBFIRST64( &pState->counterBlock[8] );
cntLow = ((cntLow - 1) & 0xffffffff00000000) | 1;
SYMCRYPT_STORE_MSBFIRST64( &pState->counterBlock[8], cntLow );
SymCryptGcmResetCounterBlock(pState);
//
// Convert the GHash state to an array of bytes
@ -259,7 +252,7 @@ SymCryptGcmComputeTag(
SYMCRYPT_STORE_MSBFIRST64( &buf[8], pState->ghashState.ull[0] );
SYMCRYPT_ASSERT( pState->pKey->pBlockCipher->blockSize == SYMCRYPT_GCM_BLOCK_SIZE );
SymCryptCtrMsb64( pState->pKey->pBlockCipher,
SymCryptCtrMsb32( pState->pKey->pBlockCipher,
&pState->pKey->blockcipherKey,
&pState->counterBlock[0],
buf,
@ -269,7 +262,6 @@ SymCryptGcmComputeTag(
SymCryptWipeKnownSize( buf, sizeof( buf ) );
}
SYMCRYPT_NOINLINE
SYMCRYPT_ERROR
SYMCRYPT_CALL
@ -337,7 +329,58 @@ SymCryptGcmKeyCopy( _In_ PCSYMCRYPT_GCM_EXPANDED_KEY pSrc, _Out_ PSYMCRYPT_GCM_E
SYMCRYPT_ASSERT( status == SYMCRYPT_NO_ERROR );
}
VOID
SYMCRYPT_CALL
SymCryptGcmSetNonce(
_Out_ PSYMCRYPT_GCM_STATE pState,
_In_reads_( cbNonce ) PCBYTE pbNonce,
SIZE_T cbNonce )
{
SYMCRYPT_ASSERT( cbNonce >= GCM_MIN_NONCE_SIZE );
// Handle the nonce depending on its size, as specified in NIST SP800-38D
if( cbNonce == 12 )
{
// If len(nonce) = 96 bits (12 bytes), pre-counter block = nonce || (DWORD) 1
memcpy( &pState->counterBlock[0], pbNonce, cbNonce );
SymCryptWipeKnownSize( &pState->counterBlock[12], 4 );
pState->counterBlock[15] = 1;
}
else
{
// If len(nonce) != 96 bits (12 bytes),
// pre-counter block = GHASH(nonce padded to a multiple of 128 bits || (QWORD) len(nonce))
BYTE buf[SYMCRYPT_GF128_BLOCK_SIZE];
SIZE_T cbNonceRemainder = cbNonce & 0xf;
SymCryptGHashAppendData( &pState->pKey->ghashKey, &pState->ghashState, pbNonce,
cbNonce - cbNonceRemainder );
// If the nonce length is not a multiple of 128 bits, it needs to be padded with zeros
// until it is, as GHASH is only defined on multiples of 128 bits.
if(cbNonceRemainder > 0)
{
SymCryptWipeKnownSize( buf, sizeof(buf) );
memcpy(buf, pbNonce + cbNonce - cbNonceRemainder, cbNonceRemainder);
SymCryptGHashAppendData( &pState->pKey->ghashKey, &pState->ghashState, buf, sizeof(buf) );
}
// Now we append the length of the nonce in bits. We take the length as a 64-bit integer,
// but it too must be padded to 128 bits for use in GHASH.
SymCryptWipeKnownSize( buf, 8 );
SYMCRYPT_STORE_MSBFIRST64( &buf[8], cbNonce * 8 );
SymCryptGHashAppendData( &pState->pKey->ghashKey, &pState->ghashState, buf, sizeof(buf) );
SymCryptGHashResult( &pState->ghashState, pState->counterBlock );
SymCryptWipeKnownSize( &pState->ghashState, sizeof( pState->ghashState ) );
}
// Increment the last 32 bits of the counter. We'll recalculate the pre-counter block later
// when computing the tag.
SYMCRYPT_STORE_MSBFIRST32(
&pState->counterBlock[12],
1 + SYMCRYPT_LOAD_MSBFIRST32( &pState->counterBlock[12] ) );
}
SYMCRYPT_NOINLINE
VOID
@ -350,8 +393,6 @@ SymCryptGcmInit(
{
UNREFERENCED_PARAMETER( cbNonce ); // It is used in an ASSERT, but only in CHKed builds.
SYMCRYPT_ASSERT( cbNonce == GCM_REQUIRED_NONCE_SIZE );
SYMCRYPT_CHECK_MAGIC( pExpandedKey );
pState->pKey = pExpandedKey;
@ -360,12 +401,7 @@ SymCryptGcmInit(
pState->bytesInMacBlock = 0;
SymCryptWipeKnownSize( &pState->ghashState, sizeof( pState->ghashState ) );
//
// Set up the counter block value
//
memcpy( &pState->counterBlock[0], pbNonce, GCM_REQUIRED_NONCE_SIZE );
SymCryptWipeKnownSize( &pState->counterBlock[12], 4 );
pState->counterBlock[15] = 2;
SymCryptGcmSetNonce(pState, pbNonce, cbNonce);
SYMCRYPT_SET_MAGIC( pState );
}
@ -591,12 +627,11 @@ SymCryptGcmEncrypt(
SYMCRYPT_ALIGN BYTE buf[2 * SYMCRYPT_GCM_BLOCK_SIZE];
SYMCRYPT_GCM_STATE state;
PSYMCRYPT_GCM_STATE pState = &state;
UINT64 cntLow;
// SymCryptGcmInit( &state, pExpandedKey, pbNonce, cbNonce );
UNREFERENCED_PARAMETER( cbNonce ); // It is used in an ASSERT, but only in CHKed builds.
SYMCRYPT_ASSERT( cbNonce == GCM_REQUIRED_NONCE_SIZE );
SYMCRYPT_ASSERT( cbNonce >= GCM_MIN_NONCE_SIZE );
SYMCRYPT_ASSERT( cbTag >= GCM_MIN_TAG_SIZE && cbTag <= GCM_MAX_TAG_SIZE );
SYMCRYPT_CHECK_MAGIC( pExpandedKey );
@ -607,14 +642,7 @@ SymCryptGcmEncrypt(
pState->bytesInMacBlock = 0;
SymCryptWipeKnownSize( &pState->ghashState, sizeof( pState->ghashState ) );
memcpy( &pState->counterBlock[0], pbNonce, GCM_REQUIRED_NONCE_SIZE );
SymCryptWipeKnownSize( &pState->counterBlock[12], 4 );
pState->counterBlock[15] = 1;
// Keep cntLow (for encrypting the tag) for later
cntLow = *((PUINT64) &pState->counterBlock[8]);
pState->counterBlock[15] = 2;
SymCryptGcmSetNonce( pState, pbNonce, cbNonce );
// SymCryptGcmAuthPart( &state, pbAuthData, cbAuthData );
pState->cbAuthData += cbAuthData;
@ -670,7 +698,8 @@ SymCryptGcmEncrypt(
SymCryptGHashAppendData( &pState->pKey->ghashKey, &pState->ghashState, &buf[16], SYMCRYPT_GCM_BLOCK_SIZE );
}
*((PUINT64) &pState->counterBlock[8]) = cntLow;
// Reset the counter block prior to computing the tag
SymCryptGcmResetCounterBlock( pState );
//
// Convert the GHash state to an array of bytes
@ -679,7 +708,7 @@ SymCryptGcmEncrypt(
SYMCRYPT_STORE_MSBFIRST64( &buf[8], pState->ghashState.ull[0] );
SYMCRYPT_ASSERT( pState->pKey->pBlockCipher->blockSize == SYMCRYPT_GCM_BLOCK_SIZE );
SymCryptCtrMsb64( pState->pKey->pBlockCipher,
SymCryptCtrMsb32( pState->pKey->pBlockCipher,
&pState->pKey->blockcipherKey,
&pState->counterBlock[0],
buf,
@ -712,12 +741,11 @@ SymCryptGcmDecrypt(
SYMCRYPT_ALIGN BYTE buf[2 * SYMCRYPT_GCM_BLOCK_SIZE];
SYMCRYPT_GCM_STATE state;
PSYMCRYPT_GCM_STATE pState = &state;
UINT64 cntLow;
// SymCryptGcmInit( &state, pExpandedKey, pbNonce, cbNonce );
UNREFERENCED_PARAMETER( cbNonce ); // It is used in an ASSERT, but only in CHKed builds.
SYMCRYPT_ASSERT( cbNonce == GCM_REQUIRED_NONCE_SIZE );
SYMCRYPT_ASSERT( cbNonce >= GCM_MIN_NONCE_SIZE );
SYMCRYPT_ASSERT( cbTag >= GCM_MIN_TAG_SIZE && cbTag <= GCM_MAX_TAG_SIZE );
SYMCRYPT_CHECK_MAGIC( pExpandedKey );
@ -728,13 +756,7 @@ SymCryptGcmDecrypt(
pState->bytesInMacBlock = 0;
SymCryptWipeKnownSize( &pState->ghashState, sizeof( pState->ghashState ) );
memcpy( &pState->counterBlock[0], pbNonce, GCM_REQUIRED_NONCE_SIZE );
SymCryptWipeKnownSize( &pState->counterBlock[12], 4 );
pState->counterBlock[15] = 1;
// Keep cntLow (for encrypting the tag) for later
cntLow = *((PUINT64) &pState->counterBlock[8]);
pState->counterBlock[15] = 2;
SymCryptGcmSetNonce( pState, pbNonce, cbNonce );
// SymCryptGcmAuthPart( &state, pbAuthData, cbAuthData );
pState->cbAuthData += cbAuthData;
@ -790,7 +812,7 @@ SymCryptGcmDecrypt(
SymCryptGHashAppendData( &pState->pKey->ghashKey, &pState->ghashState, &buf[16], SYMCRYPT_GCM_BLOCK_SIZE );
}
*((PUINT64) &pState->counterBlock[8]) = cntLow;
SymCryptGcmResetCounterBlock( pState );
//
// Convert the GHash state to an array of bytes
@ -799,7 +821,7 @@ SymCryptGcmDecrypt(
SYMCRYPT_STORE_MSBFIRST64( &buf[8], pState->ghashState.ull[0] );
SYMCRYPT_ASSERT( pState->pKey->pBlockCipher->blockSize == SYMCRYPT_GCM_BLOCK_SIZE );
SymCryptCtrMsb64( pState->pKey->pBlockCipher,
SymCryptCtrMsb32( pState->pKey->pBlockCipher,
&pState->pKey->blockcipherKey,
&pState->counterBlock[0],
buf,

View file

@ -59,11 +59,14 @@ SymCryptGHashExpandKeyC(
VOID
SYMCRYPT_CALL
SymCryptGHashAppendDataC(
_In_reads_( SYMCRYPT_GF128_FIELD_SIZE ) PCSYMCRYPT_GF128_ELEMENT expandedKeyTable,
_Inout_ PSYMCRYPT_GF128_ELEMENT pState,
_In_reads_( cbData ) PCBYTE pbData,
_In_ SIZE_T cbData )
_In_reads_( SYMCRYPT_GF128_FIELD_SIZE ) PCSYMCRYPT_GF128_ELEMENT expandedKeyTable,
_Inout_ PSYMCRYPT_GF128_ELEMENT pState,
_In_reads_( cbData ) PCBYTE pbData,
_In_range_( SYMCRYPT_GF128_BLOCK_SIZE, SIZE_T_MAX & ~0xf) SIZE_T cbData )
{
SYMCRYPT_ASSERT(cbData >= SYMCRYPT_GF128_BLOCK_SIZE);
SYMCRYPT_ASSERT((cbData & 0xf) == 0);
UINT64 R0, R1;
UINT64 mask;
SYMCRYPT_ALIGN UINT32 state32[4];

View file

@ -1407,6 +1407,24 @@ SymCryptAesCtrMsb64Neon(
_Out_writes_( cbData ) PBYTE pbDst,
SIZE_T cbData );
VOID
SYMCRYPT_CALL
SymCryptAesCtrMsb32Xmm(
_In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
_Inout_updates_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbChainingValue,
_In_reads_( cbData ) PCBYTE pbSrc,
_Out_writes_( cbData ) PBYTE pbDst,
SIZE_T cbData );
VOID
SYMCRYPT_CALL
SymCryptAesCtrMsb32Neon(
_In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
_Inout_updates_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbChainingValue,
_In_reads_( cbData ) PCBYTE pbSrc,
_Out_writes_( cbData ) PBYTE pbDst,
SIZE_T cbData );
VOID
SYMCRYPT_CALL
SymCryptXtsAesEncryptDataUnitC(
@ -1645,6 +1663,40 @@ SymCryptGcmDecryptPartTwoPass(
_Out_writes_( cbData ) PBYTE pbDst,
SIZE_T cbData );
VOID
SYMCRYPT_CALL
SymCryptCtrMsb32(
_In_ PCSYMCRYPT_BLOCKCIPHER pBlockCipher,
_In_ PCVOID pExpandedKey,
_Inout_updates_( pBlockCipher->blockSize )
PBYTE pbChainingValue,
_In_reads_( cbData ) PCBYTE pbSrc,
_Out_writes_( cbData ) PBYTE pbDst,
SIZE_T cbData );
//
// SymCryptCtrMsb32 implements the CTR cipher mode with a 32-bit increment function.
// It is not intended to be used as-is, rather it is a building block for modes like GCM.
// See the description of SymCryptCtrMsb64 in symcrypt.h for more details.
//
// For now, this function is only intended for use with GCM, which specifies the use of a
// 32-bit increment function. It's only used in cases where we can't use one of the optimized
// implementations (i.e. on ARM32 or x86[-64] without AESNI). Therefore, unlike the 64-bit version,
// there are no optimized implementations of the CTR function to call. If we ever need this
// functionality for other block cipher modes, this function will need to be updated and we'll
// need to add an additional pointer to SYMCRYPT_BLOCKCIPHER for the optimized CTR function.
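
To make the comment above concrete, here is a minimal sketch of CTR mode with a 32-bit MSB-first increment. It is not the SymCrypt implementation; blockEncrypt() is a hypothetical stand-in for the block cipher call, and only the last 4 bytes of the 16-byte chaining value ever change.

#include <stdint.h>
#include <string.h>

typedef void (*block_encrypt_fn)( const void *pExpandedKey, const uint8_t in[16], uint8_t out[16] );

static void ctr_msb32( block_encrypt_fn blockEncrypt, const void *pExpandedKey,
                       uint8_t chaining[16], const uint8_t *pbSrc, uint8_t *pbDst, size_t cbData )
{
    uint8_t keystream[16];

    while( cbData > 0 )
    {
        size_t n = cbData < 16 ? cbData : 16;

        blockEncrypt( pExpandedKey, chaining, keystream );
        for( size_t i = 0; i < n; i++ )
        {
            pbDst[i] = pbSrc[i] ^ keystream[i];
        }

        /* Increment only the last 32 bits, most-significant byte first, mod 2^32. */
        uint32_t ctr = ((uint32_t) chaining[12] << 24) | ((uint32_t) chaining[13] << 16) |
                       ((uint32_t) chaining[14] <<  8) |  (uint32_t) chaining[15];
        ctr += 1;
        chaining[12] = (uint8_t)( ctr >> 24 );
        chaining[13] = (uint8_t)( ctr >> 16 );
        chaining[14] = (uint8_t)( ctr >>  8 );
        chaining[15] = (uint8_t)  ctr;

        pbSrc += n;
        pbDst += n;
        cbData -= n;
    }
}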
VOID
SYMCRYPT_CALL
SymCryptAesCtrMsb32(
_In_ PCSYMCRYPT_AES_EXPANDED_KEY pExpandedKey,
_Inout_updates_( SYMCRYPT_AES_BLOCK_SIZE ) PBYTE pbChainingValue,
_In_reads_( cbData ) PCBYTE pbSrc,
_Out_writes_( cbData ) PBYTE pbDst,
SIZE_T cbData );
// SymCryptAesCtrMsb32 is a dispatch function for the optimized AES CTR implementations that use
// a 32-bit counter function (currently only relevant to GCM).
SYMCRYPT_ERROR
SYMCRYPT_CALL
SymCryptParallelHashProcess_serial(

File differences are hidden because one or more lines are too long

View file

@ -344,6 +344,13 @@ AuthEncImp<ImpXxx, AlgXxx, ModeXxx>::encrypt(
return status;
}
auto nonceSizes = getNonceSizes();
if( nonceSizes.find( cbNonce ) == nonceSizes.end() )
{
status = STATUS_NOT_SUPPORTED;
return status;
}
if( !state.inComputation )
{
// Only init the authInfo if we are starting a new computation
@ -385,7 +392,6 @@ AuthEncImp<ImpXxx, AlgXxx, ModeXxx>::encrypt(
return status;
}
template<>
NTSTATUS
AuthEncImp<ImpXxx, AlgXxx, ModeXxx>::decrypt(
@ -408,20 +414,23 @@ AuthEncImp<ImpXxx, AlgXxx, ModeXxx>::decrypt(
if( flags != 0 )
{
status = STATUS_NOT_SUPPORTED;
goto cleanup;
return status;
}
auto nonceSizes = getNonceSizes();
if( nonceSizes.find( cbNonce ) == nonceSizes.end() )
{
status = STATUS_NOT_SUPPORTED;
return status;
}
authInfo.pbNonce = (PBYTE) pbNonce;
authInfo.cbNonce = (ULONG) cbNonce;
authInfo.pbAuthData = (PBYTE) pbAuthData;
authInfo.cbAuthData = (ULONG) cbAuthData;
authInfo.pbTag = (PBYTE) pbTag;
authInfo.cbTag = (ULONG) cbTag;
//authInfo.pbMacContext = NULL;
//authInfo.cbMacContext = 0;
//authInfo.cbAAD = 0;
//authInfo.cbData = 0;
//authInfo.dwFlags = 0;
ULONG res;
status = CngDecryptFn( state.hKey, (PBYTE) pbSrc, (ULONG) cbData, &authInfo, NULL, 0, pbDst, (ULONG) cbData, &res, 0 );
@ -438,7 +447,6 @@ AuthEncImp<ImpXxx, AlgXxx, ModeXxx>::decrypt(
CHECK( res == cbData, "?" );
}
cleanup:
return status;
}

View file

@ -2945,6 +2945,12 @@ AuthEncImp<ImpRsa32, AlgAes, ModeGcm>::encrypt(
goto cleanup;
}
if( cbNonce != 12 )
{
status = STATUS_NOT_SUPPORTED;
goto cleanup;
}
status = AesGcm ( &state.key,
NULL,
16,
@ -2953,6 +2959,7 @@ AuthEncImp<ImpRsa32, AlgAes, ModeGcm>::encrypt(
(PBYTE) pbAuthData, (ULONG) cbAuthData,
pbTag, (ULONG) cbTag,
ENCRYPT );
CHECK( NT_SUCCESS( status ), "GCM encrypt failure" );
cleanup:
@ -2982,6 +2989,12 @@ AuthEncImp<ImpRsa32, AlgAes, ModeGcm>::decrypt(
goto cleanup;
}
if( cbNonce != 12 )
{
status = STATUS_NOT_SUPPORTED;
goto cleanup;
}
status = AesGcm ( &state.key,
NULL,
16,

View file

@ -874,7 +874,10 @@ AuthEncImp<ImpXxx, AlgAes, ModeGcm>::getNonceSizes()
{
std::set<SIZE_T> res;
res.insert( 12 );
for( int i = 1; i <= 256; ++i )
{
res.insert( i );
}
return res;
}

View file

@ -270,7 +270,7 @@ katAuthEncSingle(
SIZE_T cbTag,
ULONGLONG line)
{
BYTE bufData[512];
BYTE bufData[4096];
BYTE bufTag[32];
NTSTATUS status;
@ -333,24 +333,27 @@ testAuthEncRandom( AuthEncMultiImp * pImp, int rrep, PCBYTE pbResult, SIZE_T cbR
std::sort( nonceSizes.begin(), nonceSizes.end() );
std::sort( tagSizes.begin(), tagSizes.end() );
//iprint( "# sizes: %d, %d, %d\n", keySizes.size(), nonceSizes.size(), tagSizes.size() );
//
//for( int i=0; i< tagSizes.size(); i++ )
//{
// iprint( "tag %d: %d\n", i, tagSizes[i] );
//}
memset( buf, 0, sizeof( buf ) );
for( int i=0; i<rrep; i++ )
{
SIZE_T cbKey = keySizes[ rng.sizet( keySizes.size() )];
SIZE_T cbNonce = nonceSizes[ rng.sizet( nonceSizes.size() )];
SIZE_T cbTag = tagSizes[ rng.sizet( tagSizes.size() )];
// Kludge: previous implementations of AES-GCM only support 12-byte nonces. That doesn't
// block this test case, as those implementations will just be ignored if a different
// nonce size is used, but since the nonce size is arbitrary for GCM, if we choose
// randomly we will rarely get a 12-byte nonce, meaning that we'll have few results to
// compare against. So, we force a 12-byte nonce 50% of the time.
if( pImp->m_algorithmName == "AesGcm" && (rng.byte() & 1) )
{
cbNonce = 12;
}
CHECK( cbKey <= bufSize && cbNonce <= bufSize && cbTag <= sizeof( tagBuf ), "??" );
SIZE_T keyIdx = rng.sizet( bufSize - cbKey );
SIZE_T nonceIdx = rng.sizet( bufSize - cbNonce );
SIZE_T tagIdx = rng.sizet( bufSize - cbTag );
@ -363,6 +366,8 @@ testAuthEncRandom( AuthEncMultiImp * pImp, int rrep, PCBYTE pbResult, SIZE_T cbR
rng.randomSubRange( bufSize, &srcIdx, &cbData );
SIZE_T dstIdx = rng.sizet( bufSize - cbData );
pImp->setKey( &buf[keyIdx], cbKey );
pImp->encrypt( &buf[nonceIdx], cbNonce,
&buf[authDataIdx], cbAuthData,
&buf[srcIdx], tmp1, cbData,