diff --git a/ssh.h b/ssh.h
index 3a2e0ece..1e732d78 100644
--- a/ssh.h
+++ b/ssh.h
@@ -558,8 +558,6 @@ struct ssh_cipher {
     const ssh_cipheralg *vt;
 };
 
-bool supports_sha_ni(void);
-
 struct ssh_cipheralg {
     ssh_cipher *(*new)(const ssh_cipheralg *alg);
     void (*free)(ssh_cipher *);
@@ -819,7 +817,11 @@ extern const ssh2_ciphers ssh2_arcfour;
 extern const ssh2_ciphers ssh2_ccp;
 extern const ssh_hashalg ssh_md5;
 extern const ssh_hashalg ssh_sha1;
+extern const ssh_hashalg ssh_sha1_hw;
+extern const ssh_hashalg ssh_sha1_sw;
 extern const ssh_hashalg ssh_sha256;
+extern const ssh_hashalg ssh_sha256_hw;
+extern const ssh_hashalg ssh_sha256_sw;
 extern const ssh_hashalg ssh_sha384;
 extern const ssh_hashalg ssh_sha512;
 extern const ssh_kexes ssh_diffiehellman_group1;
@@ -867,29 +869,6 @@ extern const char sshver[];
  */
 extern bool ssh_fallback_cmd(Backend *backend);
 
-/*
- * Check of compiler version
- */
-#ifdef _FORCE_SHA_NI
-# define COMPILER_SUPPORTS_SHA_NI
-#elif defined(__clang__)
-# if __has_attribute(target) && __has_include(<shaintrin.h>) && (defined(__x86_64__) || defined(__i386))
-# define COMPILER_SUPPORTS_SHA_NI
-# endif
-#elif defined(__GNUC__)
-# if ((__GNUC__ >= 5) && (defined(__x86_64__) || defined(__i386)))
-# define COMPILER_SUPPORTS_SHA_NI
-# endif
-#elif defined (_MSC_VER)
-# if (defined(_M_X64) || defined(_M_IX86)) && _MSC_VER >= 1900
-# define COMPILER_SUPPORTS_SHA_NI
-# endif
-#endif
-
-#ifdef _FORCE_SOFTWARE_SHA
-# undef COMPILER_SUPPORTS_SHA_NI
-#endif
-
 /*
  * The PRNG type, defined in sshprng.c. Visible data fields are
  * 'savesize', which suggests how many random bytes you should request
diff --git a/sshsh256.c b/sshsh256.c
index fbd8bdb2..11facbf8 100644
--- a/sshsh256.c
+++ b/sshsh256.c
@@ -7,259 +7,302 @@
 #include "ssh.h"
 #include <assert.h>
 
-/* ----------------------------------------------------------------------
- * Core SHA256 algorithm: processes 16-word blocks into a message digest.
+/*
+ * Start by deciding whether we can support hardware SHA at all.
  */
+#define HW_SHA256_NONE 0
+#define HW_SHA256_NI 1
 
-#define ror(x,y) ( ((x) << (32-y)) | (((uint32_t)(x)) >> (y)) )
-#define shr(x,y) ( (((uint32_t)(x)) >> (y)) )
-#define Ch(x,y,z) ( ((x) & (y)) ^ (~(x) & (z)) )
-#define Maj(x,y,z) ( ((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)) )
-#define bigsigma0(x) ( ror((x),2) ^ ror((x),13) ^ ror((x),22) )
-#define bigsigma1(x) ( ror((x),6) ^ ror((x),11) ^ ror((x),25) )
-#define smallsigma0(x) ( ror((x),7) ^ ror((x),18) ^ shr((x),3) )
-#define smallsigma1(x) ( ror((x),17) ^ ror((x),19) ^ shr((x),10) )
+#ifdef _FORCE_SHA_NI
+# define HW_SHA256 HW_SHA256_NI
+#elif defined(__clang__)
+# if __has_attribute(target) && __has_include(<shaintrin.h>) && \
+    (defined(__x86_64__) || defined(__i386))
+# define HW_SHA256 HW_SHA256_NI
+# endif
+#elif defined(__GNUC__)
+# if (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4)) && \
+    (defined(__x86_64__) || defined(__i386))
+# define HW_SHA256 HW_SHA256_NI
+# endif
+#elif defined (_MSC_VER)
+# if (defined(_M_X64) || defined(_M_IX86)) && _MSC_FULL_VER >= 150030729
+# define HW_SHA256 HW_SHA256_NI
+# endif
+#endif
 
-typedef struct SHA256_State {
-    uint32_t h[8];
-    unsigned char block[64];
-    int blkused;
-    uint64_t len;
-    void (*sha256)(struct SHA256_State * s, const unsigned char *p, int len);
-    BinarySink_IMPLEMENTATION;
-} SHA256_State;
-
-static void SHA256_sw(SHA256_State *s, const unsigned char *q, int len);
-static void SHA256_ni(SHA256_State *s, const unsigned char *q, int len);
-
-void SHA256_Core_Init(SHA256_State *s) {
-    s->h[0] = 0x6a09e667;
-    s->h[1] = 0xbb67ae85;
-    s->h[2] = 0x3c6ef372;
-    s->h[3] = 0xa54ff53a;
-    s->h[4] = 0x510e527f;
-    s->h[5] = 0x9b05688c;
-    s->h[6] = 0x1f83d9ab;
-    s->h[7] = 0x5be0cd19;
-}
-
-void SHA256_Block(SHA256_State *s, uint32_t *block) {
-    uint32_t w[80];
-    uint32_t a,b,c,d,e,f,g,h;
-    static const int k[] = {
-        0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
-        0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
-        0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
-        0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
-        0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
-        0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
-        0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
-        0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
-        0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
-        0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
-        0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
-        0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
-        0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
-        0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
-        0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
-        0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
-    };
-
-    int t;
-
-    for (t = 0; t < 16; t++)
-        w[t] = block[t];
-
-    for (t = 16; t < 64; t++)
-        w[t] = smallsigma1(w[t-2]) + w[t-7] + smallsigma0(w[t-15]) + w[t-16];
-
-    a = s->h[0]; b = s->h[1]; c = s->h[2]; d = s->h[3];
-    e = s->h[4]; f = s->h[5]; g = s->h[6]; h = s->h[7];
-
-    for (t = 0; t < 64; t+=8) {
-        uint32_t t1, t2;
-
-#define ROUND(j,a,b,c,d,e,f,g,h) \
-        t1 = h + bigsigma1(e) + Ch(e,f,g) + k[j] + w[j]; \
-        t2 = bigsigma0(a) + Maj(a,b,c); \
-        d = d + t1; h = t1 + t2;
-
-        ROUND(t+0, a,b,c,d,e,f,g,h);
-        ROUND(t+1, h,a,b,c,d,e,f,g);
-        ROUND(t+2, g,h,a,b,c,d,e,f);
-        ROUND(t+3, f,g,h,a,b,c,d,e);
-        ROUND(t+4, e,f,g,h,a,b,c,d);
-        ROUND(t+5, d,e,f,g,h,a,b,c);
-        ROUND(t+6, c,d,e,f,g,h,a,b);
-        ROUND(t+7, b,c,d,e,f,g,h,a);
-    }
-
-    s->h[0] += a; s->h[1] += b; s->h[2] += c; s->h[3] += d;
-    s->h[4] += e; s->h[5] += f; s->h[6] += g; s->h[7] += h;
-}
-
-/*
---------------------------------------------------------------------- - * Outer SHA256 algorithm: take an arbitrary length byte string, - * convert it into 16-word blocks with the prescribed padding at - * the end, and pass those blocks to the core SHA256 algorithm. - */ - -#define BLKSIZE 64 - -static void SHA256_BinarySink_write(BinarySink *bs, - const void *p, size_t len); - -void SHA256_Init(SHA256_State *s) { - SHA256_Core_Init(s); - s->blkused = 0; - s->len = 0; - if (supports_sha_ni()) - s->sha256 = &SHA256_ni; - else - s->sha256 = &SHA256_sw; - BinarySink_INIT(s, SHA256_BinarySink_write); -} - -static void SHA256_BinarySink_write(BinarySink *bs, - const void *p, size_t len) -{ - struct SHA256_State *s = BinarySink_DOWNCAST(bs, struct SHA256_State); - unsigned char *q = (unsigned char *)p; - - /* - * Update the length field. - */ - s->len += len; - - (*(s->sha256))(s, q, len); -} - -static void SHA256_sw(SHA256_State *s, const unsigned char *q, int len) { - uint32_t wordblock[16]; - int i; - - if (s->blkused && s->blkused+len < BLKSIZE) { - /* - * Trivial case: just add to the block. - */ - memcpy(s->block + s->blkused, q, len); - s->blkused += len; - } else { - /* - * We must complete and process at least one block. - */ - while (s->blkused + len >= BLKSIZE) { - memcpy(s->block + s->blkused, q, BLKSIZE - s->blkused); - q += BLKSIZE - s->blkused; - len -= BLKSIZE - s->blkused; - /* Now process the block. Gather bytes big-endian into words */ - for (i = 0; i < 16; i++) { - wordblock[i] = - ( ((uint32_t)s->block[i*4+0]) << 24 ) | - ( ((uint32_t)s->block[i*4+1]) << 16 ) | - ( ((uint32_t)s->block[i*4+2]) << 8 ) | - ( ((uint32_t)s->block[i*4+3]) << 0 ); - } - SHA256_Block(s, wordblock); - s->blkused = 0; - } - memcpy(s->block, q, len); - s->blkused = len; - } -} - -void SHA256_Final(SHA256_State *s, unsigned char *digest) { - int i; - int pad; - unsigned char c[64]; - uint64_t len; - - if (s->blkused >= 56) - pad = 56 + 64 - s->blkused; - else - pad = 56 - s->blkused; - - len = (s->len << 3); - - memset(c, 0, pad); - c[0] = 0x80; - put_data(s, &c, pad); - - put_uint64(s, len); - - for (i = 0; i < 8; i++) { - digest[i*4+0] = (s->h[i] >> 24) & 0xFF; - digest[i*4+1] = (s->h[i] >> 16) & 0xFF; - digest[i*4+2] = (s->h[i] >> 8) & 0xFF; - digest[i*4+3] = (s->h[i] >> 0) & 0xFF; - } -} - -void SHA256_Simple(const void *p, int len, unsigned char *output) { - SHA256_State s; - - SHA256_Init(&s); - put_data(&s, p, len); - SHA256_Final(&s, output); - smemclr(&s, sizeof(s)); -} +#if defined _FORCE_SOFTWARE_SHA || !defined HW_SHA256 +# undef HW_SHA256 +# define HW_SHA256 HW_SHA256_NONE +#endif /* - * Thin abstraction for things where hashes are pluggable. + * The actual query function that asks if hardware acceleration is + * available. */ +static bool sha256_hw_available(void); -struct sha256_hash { - SHA256_State state; - ssh_hash hash; -}; - -static ssh_hash *sha256_new(const ssh_hashalg *alg) +/* + * The top-level selection function, caching the results of + * sha256_hw_available() so it only has to run once. 
+ */ +static bool sha256_hw_available_cached(void) { - struct sha256_hash *h = snew(struct sha256_hash); - SHA256_Init(&h->state); - h->hash.vt = alg; - BinarySink_DELEGATE_INIT(&h->hash, &h->state); - return &h->hash; + static bool initialised = false; + static bool hw_available; + if (!initialised) { + hw_available = sha256_hw_available(); + initialised = true; + } + return hw_available; } -static ssh_hash *sha256_copy(ssh_hash *hashold) +static ssh_hash *sha256_select(const ssh_hashalg *alg) { - struct sha256_hash *hold, *hnew; - ssh_hash *hashnew = sha256_new(hashold->vt); + const ssh_hashalg *real_alg = + sha256_hw_available_cached() ? &ssh_sha256_hw : &ssh_sha256_sw; - hold = container_of(hashold, struct sha256_hash, hash); - hnew = container_of(hashnew, struct sha256_hash, hash); - - hnew->state = hold->state; - BinarySink_COPIED(&hnew->state); - - return hashnew; -} - -static void sha256_free(ssh_hash *hash) -{ - struct sha256_hash *h = container_of(hash, struct sha256_hash, hash); - - smemclr(h, sizeof(*h)); - sfree(h); -} - -static void sha256_final(ssh_hash *hash, unsigned char *output) -{ - struct sha256_hash *h = container_of(hash, struct sha256_hash, hash); - SHA256_Final(&h->state, output); - sha256_free(hash); + return ssh_hash_new(real_alg); } const ssh_hashalg ssh_sha256 = { - sha256_new, sha256_copy, sha256_final, sha256_free, 32, 64, "SHA-256" + sha256_select, NULL, NULL, NULL, + 32, 64, "SHA-256", }; -#ifdef COMPILER_SUPPORTS_SHA_NI +/* ---------------------------------------------------------------------- + * Definitions likely to be helpful to multiple implementations. + */ -#if defined _MSC_VER && defined _M_AMD64 -# include -#endif +static const uint32_t sha256_initial_state[] = { + 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, + 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19, +}; + +static const uint32_t sha256_round_constants[] = { + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, + 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, + 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, + 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, + 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, + 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, + 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, + 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, + 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, + 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, + 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, + 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, + 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, + 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, +}; + +#define SHA256_ROUNDS 64 + +typedef struct sha256_block sha256_block; +struct sha256_block { + uint8_t block[64]; + size_t used; + uint64_t len; +}; + +static inline void sha256_block_setup(sha256_block *blk) +{ + blk->used = 0; + blk->len = 0; +} + +static inline bool sha256_block_write( + sha256_block *blk, const void **vdata, size_t *len) +{ + size_t blkleft = sizeof(blk->block) - blk->used; + size_t chunk = *len < blkleft ? 
*len : blkleft; + + const uint8_t *p = *vdata; + memcpy(blk->block + blk->used, p, chunk); + *vdata = p + chunk; + *len -= chunk; + blk->used += chunk; + blk->len += chunk; + + if (blk->used == sizeof(blk->block)) { + blk->used = 0; + return true; + } + + return false; +} + +static inline void sha256_block_pad(sha256_block *blk, BinarySink *bs) +{ + uint64_t final_len = blk->len << 3; + size_t pad = 1 + (63 & (55 - blk->used)); + + put_byte(bs, 0x80); + for (size_t i = 1; i < pad; i++) + put_byte(bs, 0); + put_uint64(bs, final_len); + + assert(blk->used == 0 && "Should have exactly hit a block boundary"); +} + +/* ---------------------------------------------------------------------- + * Software implementation of SHA-256. + */ + +static inline uint32_t ror(uint32_t x, unsigned y) +{ + return (x << (31 & -y)) | (x >> (31 & y)); +} + +static inline uint32_t Ch(uint32_t ctrl, uint32_t if1, uint32_t if0) +{ + return if0 ^ (ctrl & (if1 ^ if0)); +} + +static inline uint32_t Maj(uint32_t x, uint32_t y, uint32_t z) +{ + return (x & y) | (z & (x | y)); +} + +static inline uint32_t Sigma_0(uint32_t x) +{ + return ror(x,2) ^ ror(x,13) ^ ror(x,22); +} + +static inline uint32_t Sigma_1(uint32_t x) +{ + return ror(x,6) ^ ror(x,11) ^ ror(x,25); +} + +static inline uint32_t sigma_0(uint32_t x) +{ + return ror(x,7) ^ ror(x,18) ^ (x >> 3); +} + +static inline uint32_t sigma_1(uint32_t x) +{ + return ror(x,17) ^ ror(x,19) ^ (x >> 10); +} + +static inline void sha256_sw_round( + unsigned round_index, const uint32_t *schedule, + uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d, + uint32_t *e, uint32_t *f, uint32_t *g, uint32_t *h) +{ + uint32_t t1 = *h + Sigma_1(*e) + Ch(*e,*f,*g) + + sha256_round_constants[round_index] + schedule[round_index]; + + uint32_t t2 = Sigma_0(*a) + Maj(*a,*b,*c); + + *d += t1; + *h = t1 + t2; +} + +static void sha256_sw_block(uint32_t *core, const uint8_t *block) +{ + uint32_t w[SHA256_ROUNDS]; + uint32_t a,b,c,d,e,f,g,h; + + for (size_t t = 0; t < 16; t++) + w[t] = GET_32BIT_MSB_FIRST(block + 4*t); + + for (size_t t = 16; t < SHA256_ROUNDS; t++) + w[t] = sigma_1(w[t-2]) + w[t-7] + sigma_0(w[t-15]) + w[t-16]; + + a = core[0]; b = core[1]; c = core[2]; d = core[3]; + e = core[4]; f = core[5]; g = core[6]; h = core[7]; + + for (size_t t = 0; t < SHA256_ROUNDS; t += 8) { + sha256_sw_round(t+0, w, &a,&b,&c,&d,&e,&f,&g,&h); + sha256_sw_round(t+1, w, &h,&a,&b,&c,&d,&e,&f,&g); + sha256_sw_round(t+2, w, &g,&h,&a,&b,&c,&d,&e,&f); + sha256_sw_round(t+3, w, &f,&g,&h,&a,&b,&c,&d,&e); + sha256_sw_round(t+4, w, &e,&f,&g,&h,&a,&b,&c,&d); + sha256_sw_round(t+5, w, &d,&e,&f,&g,&h,&a,&b,&c); + sha256_sw_round(t+6, w, &c,&d,&e,&f,&g,&h,&a,&b); + sha256_sw_round(t+7, w, &b,&c,&d,&e,&f,&g,&h,&a); + } + + core[0] += a; core[1] += b; core[2] += c; core[3] += d; + core[4] += e; core[5] += f; core[6] += g; core[7] += h; + + smemclr(w, sizeof(w)); +} + +typedef struct sha256_sw { + uint32_t core[8]; + sha256_block blk; + BinarySink_IMPLEMENTATION; + ssh_hash hash; +} sha256_sw; + +static void sha256_sw_write(BinarySink *bs, const void *vp, size_t len); + +static ssh_hash *sha256_sw_new(const ssh_hashalg *alg) +{ + sha256_sw *s = snew(sha256_sw); + + memcpy(s->core, sha256_initial_state, sizeof(s->core)); + + sha256_block_setup(&s->blk); + + s->hash.vt = alg; + BinarySink_INIT(s, sha256_sw_write); + BinarySink_DELEGATE_INIT(&s->hash, s); + return &s->hash; +} + +static ssh_hash *sha256_sw_copy(ssh_hash *hash) +{ + sha256_sw *s = container_of(hash, sha256_sw, hash); + sha256_sw *copy = snew(sha256_sw); + + 
+    memcpy(copy, s, sizeof(*copy));
+    BinarySink_COPIED(copy);
+    BinarySink_DELEGATE_INIT(&copy->hash, copy);
+
+    return &copy->hash;
+}
+
+static void sha256_sw_free(ssh_hash *hash)
+{
+    sha256_sw *s = container_of(hash, sha256_sw, hash);
+
+    smemclr(s, sizeof(*s));
+    sfree(s);
+}
+
+static void sha256_sw_write(BinarySink *bs, const void *vp, size_t len)
+{
+    sha256_sw *s = BinarySink_DOWNCAST(bs, sha256_sw);
+
+    while (len > 0)
+        if (sha256_block_write(&s->blk, &vp, &len))
+            sha256_sw_block(s->core, s->blk.block);
+}
+
+static void sha256_sw_final(ssh_hash *hash, uint8_t *digest)
+{
+    sha256_sw *s = container_of(hash, sha256_sw, hash);
+
+    sha256_block_pad(&s->blk, BinarySink_UPCAST(s));
+    for (size_t i = 0; i < 8; i++)
+        PUT_32BIT_MSB_FIRST(digest + 4*i, s->core[i]);
+    sha256_sw_free(hash);
+}
+
+const ssh_hashalg ssh_sha256_sw = {
+    sha256_sw_new, sha256_sw_copy, sha256_sw_final, sha256_sw_free,
+    32, 64, "SHA-256",
+};
+
+/* ----------------------------------------------------------------------
+ * Hardware-accelerated implementation of SHA-256 using x86 SHA-NI.
+ */
+
+#if HW_SHA256 == HW_SHA256_NI
 
 /*
  * Set target architecture for Clang and GCC
@@ -269,7 +312,7 @@ const ssh_hashalg ssh_sha256 = {
 # pragma GCC target("sse4.1")
 #endif
 
-#if defined(__clang__) || (defined(__GNUC__) && (__GNUC__ >= 5))
+#if defined(__clang__) || (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)))
 # define FUNC_ISA __attribute__ ((target("sse4.1,sha")))
 #else
 # define FUNC_ISA
@@ -278,236 +321,369 @@ const ssh_hashalg ssh_sha256 = {
 #include <wmmintrin.h>
 #include <smmintrin.h>
 #include <immintrin.h>
-
 #if defined(__clang__) || defined(__GNUC__)
 #include <shaintrin.h>
 #endif
 
+#if defined(__clang__) || defined(__GNUC__)
+#include <cpuid.h>
+#define GET_CPU_ID_0(out) \
+    __cpuid(0, (out)[0], (out)[1], (out)[2], (out)[3])
+#define GET_CPU_ID_7(out) \
+    __cpuid_count(7, 0, (out)[0], (out)[1], (out)[2], (out)[3])
+#else
+#define GET_CPU_ID_0(out) __cpuid(out, 0)
+#define GET_CPU_ID_7(out) __cpuidex(out, 7, 0)
+#endif
+
+static bool sha256_hw_available(void)
+{
+    unsigned int CPUInfo[4];
+    GET_CPU_ID_0(CPUInfo);
+    if (CPUInfo[0] < 7)
+        return false;
+
+    GET_CPU_ID_7(CPUInfo);
+    return CPUInfo[1] & (1 << 29); /* Check SHA */
+}
+
 /* SHA256 implementation using new instructions
    The code is based on Jeffrey Walton's SHA256 implementation:
    https://github.com/noloader/SHA-Intrinsics
 */
 FUNC_ISA
-static void SHA256_ni_(SHA256_State * s, const unsigned char *q, int len) {
-    if (s->blkused && s->blkused+len < BLKSIZE) {
-        /*
-         * Trivial case: just add to the block.
- */ - memcpy(s->block + s->blkused, q, len); - s->blkused += len; - } else { - __m128i STATE0, STATE1; - __m128i MSG, TMP; - __m128i MSG0, MSG1, MSG2, MSG3; - __m128i ABEF_SAVE, CDGH_SAVE; - const __m128i MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL); +static inline void sha256_ni_block(__m128i *core, const uint8_t *p) +{ + __m128i STATE0, STATE1; + __m128i MSG, TMP; + __m128i MSG0, MSG1, MSG2, MSG3; + const __m128i *block = (const __m128i *)p; + const __m128i MASK = _mm_set_epi64x( + 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL); - /* Load initial values */ - TMP = _mm_loadu_si128((const __m128i*) &s->h[0]); - STATE1 = _mm_loadu_si128((const __m128i*) &s->h[4]); + /* Load initial values */ + STATE0 = core[0]; + STATE1 = core[1]; - TMP = _mm_shuffle_epi32(TMP, 0xB1); /* CDAB */ - STATE1 = _mm_shuffle_epi32(STATE1, 0x1B); /* EFGH */ - STATE0 = _mm_alignr_epi8(TMP, STATE1, 8); /* ABEF */ - STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0); /* CDGH */ - /* - * We must complete and process at least one block. - */ - while (s->blkused + len >= BLKSIZE) { - memcpy(s->block + s->blkused, q, BLKSIZE - s->blkused); - q += BLKSIZE - s->blkused; - len -= BLKSIZE - s->blkused; + /* Rounds 0-3 */ + MSG = _mm_loadu_si128(block); + MSG0 = _mm_shuffle_epi8(MSG, MASK); + MSG = _mm_add_epi32(MSG0, _mm_set_epi64x( + 0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - /* Save current state */ - ABEF_SAVE = STATE0; - CDGH_SAVE = STATE1; + /* Rounds 4-7 */ + MSG1 = _mm_loadu_si128(block + 1); + MSG1 = _mm_shuffle_epi8(MSG1, MASK); + MSG = _mm_add_epi32(MSG1, _mm_set_epi64x( + 0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1); - /* Rounds 0-3 */ - MSG = _mm_loadu_si128((const __m128i*) (s->block + 0)); - MSG0 = _mm_shuffle_epi8(MSG, MASK); - MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + /* Rounds 8-11 */ + MSG2 = _mm_loadu_si128(block + 2); + MSG2 = _mm_shuffle_epi8(MSG2, MASK); + MSG = _mm_add_epi32(MSG2, _mm_set_epi64x( + 0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2); - /* Rounds 4-7 */ - MSG1 = _mm_loadu_si128((const __m128i*) (s->block + 16)); - MSG1 = _mm_shuffle_epi8(MSG1, MASK); - MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1); + /* Rounds 12-15 */ + MSG3 = _mm_loadu_si128(block + 3); + MSG3 = _mm_shuffle_epi8(MSG3, MASK); + MSG = _mm_add_epi32(MSG3, _mm_set_epi64x( + 0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG3, MSG2, 4); + MSG0 = _mm_add_epi32(MSG0, TMP); + MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = 
_mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3); - /* Rounds 8-11 */ - MSG2 = _mm_loadu_si128((const __m128i*) (s->block + 32)); - MSG2 = _mm_shuffle_epi8(MSG2, MASK); - MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2); + /* Rounds 16-19 */ + MSG = _mm_add_epi32(MSG0, _mm_set_epi64x( + 0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG0, MSG3, 4); + MSG1 = _mm_add_epi32(MSG1, TMP); + MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0); - /* Rounds 12-15 */ - MSG3 = _mm_loadu_si128((const __m128i*) (s->block + 48)); - MSG3 = _mm_shuffle_epi8(MSG3, MASK); - MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(MSG3, MSG2, 4); - MSG0 = _mm_add_epi32(MSG0, TMP); - MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3); + /* Rounds 20-23 */ + MSG = _mm_add_epi32(MSG1, _mm_set_epi64x( + 0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG1, MSG0, 4); + MSG2 = _mm_add_epi32(MSG2, TMP); + MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1); - /* Rounds 16-19 */ - MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(MSG0, MSG3, 4); - MSG1 = _mm_add_epi32(MSG1, TMP); - MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0); + /* Rounds 24-27 */ + MSG = _mm_add_epi32(MSG2, _mm_set_epi64x( + 0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG2, MSG1, 4); + MSG3 = _mm_add_epi32(MSG3, TMP); + MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2); - /* Rounds 20-23 */ - MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(MSG1, MSG0, 4); - MSG2 = _mm_add_epi32(MSG2, TMP); - MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1); + /* Rounds 28-31 */ + MSG = _mm_add_epi32(MSG3, _mm_set_epi64x( + 0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG3, MSG2, 4); + MSG0 = _mm_add_epi32(MSG0, TMP); + MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3); - /* 
Rounds 24-27 */ - MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(MSG2, MSG1, 4); - MSG3 = _mm_add_epi32(MSG3, TMP); - MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2); + /* Rounds 32-35 */ + MSG = _mm_add_epi32(MSG0, _mm_set_epi64x( + 0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG0, MSG3, 4); + MSG1 = _mm_add_epi32(MSG1, TMP); + MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0); - /* Rounds 28-31 */ - MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(MSG3, MSG2, 4); - MSG0 = _mm_add_epi32(MSG0, TMP); - MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3); + /* Rounds 36-39 */ + MSG = _mm_add_epi32(MSG1, _mm_set_epi64x( + 0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG1, MSG0, 4); + MSG2 = _mm_add_epi32(MSG2, TMP); + MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1); - /* Rounds 32-35 */ - MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(MSG0, MSG3, 4); - MSG1 = _mm_add_epi32(MSG1, TMP); - MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0); + /* Rounds 40-43 */ + MSG = _mm_add_epi32(MSG2, _mm_set_epi64x( + 0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG2, MSG1, 4); + MSG3 = _mm_add_epi32(MSG3, TMP); + MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2); - /* Rounds 36-39 */ - MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(MSG1, MSG0, 4); - MSG2 = _mm_add_epi32(MSG2, TMP); - MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1); + /* Rounds 44-47 */ + MSG = _mm_add_epi32(MSG3, _mm_set_epi64x( + 0x106AA070F40E3585ULL, 0xD6990624D192E819ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG3, MSG2, 4); + MSG0 = _mm_add_epi32(MSG0, TMP); + MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3); - /* Rounds 40-43 */ - MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = 
_mm_alignr_epi8(MSG2, MSG1, 4); - MSG3 = _mm_add_epi32(MSG3, TMP); - MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2); + /* Rounds 48-51 */ + MSG = _mm_add_epi32(MSG0, _mm_set_epi64x( + 0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG0, MSG3, 4); + MSG1 = _mm_add_epi32(MSG1, TMP); + MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0); - /* Rounds 44-47 */ - MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(MSG3, MSG2, 4); - MSG0 = _mm_add_epi32(MSG0, TMP); - MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3); + /* Rounds 52-55 */ + MSG = _mm_add_epi32(MSG1, _mm_set_epi64x( + 0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG1, MSG0, 4); + MSG2 = _mm_add_epi32(MSG2, TMP); + MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - /* Rounds 48-51 */ - MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(MSG0, MSG3, 4); - MSG1 = _mm_add_epi32(MSG1, TMP); - MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0); + /* Rounds 56-59 */ + MSG = _mm_add_epi32(MSG2, _mm_set_epi64x( + 0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG2, MSG1, 4); + MSG3 = _mm_add_epi32(MSG3, TMP); + MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - /* Rounds 52-55 */ - MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(MSG1, MSG0, 4); - MSG2 = _mm_add_epi32(MSG2, TMP); - MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + /* Rounds 60-63 */ + MSG = _mm_add_epi32(MSG3, _mm_set_epi64x( + 0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - /* Rounds 56-59 */ - MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(MSG2, MSG1, 4); - MSG3 = _mm_add_epi32(MSG3, TMP); - MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - - /* Rounds 60-63 */ - MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); 
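/*
 * Aside (annotation, not part of the patch): all sixteen four-round
 * groups above instantiate one template, so the hot path is easier to
 * read as the sketch below. K_HI/K_LO are hypothetical stand-ins for
 * the packed round constants; the intrinsics are the same ones this
 * patch already uses.
 *
 *    MSG = _mm_add_epi32(MSGn, _mm_set_epi64x(K_HI, K_LO));
 *    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);  // rounds i, i+1
 *    MSG = _mm_shuffle_epi32(MSG, 0x0E);        // move W+K for i+2, i+3 down
 *    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);  // rounds i+2, i+3
 *
 * plus, in the schedule-extending groups, one _mm_sha256msg1_epu32 and
 * one _mm_sha256msg2_epu32 call to derive the next four message words.
 */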
-
-        /* Combine state */
-        STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE);
-        STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE);
-
-        s->blkused = 0;
-        }
-
-        TMP = _mm_shuffle_epi32(STATE0, 0x1B);       /* FEBA */
-        STATE1 = _mm_shuffle_epi32(STATE1, 0xB1);    /* DCHG */
-        STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0); /* DCBA */
-        STATE1 = _mm_alignr_epi8(STATE1, TMP, 8);    /* ABEF */
-
-        /* Save state */
-        _mm_storeu_si128((__m128i*) &s->h[0], STATE0);
-        _mm_storeu_si128((__m128i*) &s->h[4], STATE1);
-
-        memcpy(s->block, q, len);
-        s->blkused = len;
-    }
+    /* Combine state */
+    core[0] = _mm_add_epi32(STATE0, core[0]);
+    core[1] = _mm_add_epi32(STATE1, core[1]);
 }
 
-/*
- * Workaround LLVM bug https://bugs.llvm.org/show_bug.cgi?id=34980
+typedef struct sha256_ni {
+    /*
+     * These two vectors store the 8 words of the SHA-256 state, but
+     * not in the same order they appear in the spec: the first word
+     * holds A,B,E,F and the second word C,D,G,H.
+     */
+    __m128i core[2];
+    sha256_block blk;
+    void *pointer_to_free;
+    BinarySink_IMPLEMENTATION;
+    ssh_hash hash;
+} sha256_ni;
+
+static void sha256_ni_write(BinarySink *bs, const void *vp, size_t len);
+
+static sha256_ni *sha256_ni_alloc(void)
+{
+    /*
+     * The __m128i variables in the context structure need to be
+     * 16-byte aligned, but not all malloc implementations that this
+     * code has to work with will guarantee to return a 16-byte
+     * aligned pointer. So we over-allocate, manually realign the
+     * pointer ourselves, and store the original one inside the
+     * context so we know how to free it later.
+     */
+    void *allocation = smalloc(sizeof(sha256_ni) + 15);
+    uintptr_t alloc_address = (uintptr_t)allocation;
+    uintptr_t aligned_address = (alloc_address + 15) & ~15;
+    sha256_ni *s = (sha256_ni *)aligned_address;
+    s->pointer_to_free = allocation;
+    return s;
+}
+
+FUNC_ISA static ssh_hash *sha256_ni_new(const ssh_hashalg *alg)
+{
+    if (!sha256_hw_available_cached())
+        return NULL;
+
+    sha256_ni *s = sha256_ni_alloc();
+
+    /* Initialise the core vectors in their storage order */
+    s->core[0] = _mm_set_epi64x(
+        0x6a09e667bb67ae85ULL, 0x510e527f9b05688cULL);
+    s->core[1] = _mm_set_epi64x(
+        0x3c6ef372a54ff53aULL, 0x1f83d9ab5be0cd19ULL);
+
+    sha256_block_setup(&s->blk);
+
+    s->hash.vt = alg;
+    BinarySink_INIT(s, sha256_ni_write);
+    BinarySink_DELEGATE_INIT(&s->hash, s);
+    return &s->hash;
+}
+
+static ssh_hash *sha256_ni_copy(ssh_hash *hash)
+{
+    sha256_ni *s = container_of(hash, sha256_ni, hash);
+    sha256_ni *copy = sha256_ni_alloc();
+
+    void *ptf_save = copy->pointer_to_free;
+    *copy = *s; /* structure copy */
+    copy->pointer_to_free = ptf_save;
+
+    BinarySink_COPIED(copy);
+    BinarySink_DELEGATE_INIT(&copy->hash, copy);
+
+    return &copy->hash;
+}
+
+static void sha256_ni_free(ssh_hash *hash)
+{
+    sha256_ni *s = container_of(hash, sha256_ni, hash);
+
+    void *ptf = s->pointer_to_free;
+    smemclr(s, sizeof(*s));
+    sfree(ptf);
+}
+
+static void sha256_ni_write(BinarySink *bs, const void *vp, size_t len)
+{
+    sha256_ni *s = BinarySink_DOWNCAST(bs, sha256_ni);
+
+    while (len > 0)
+        if (sha256_block_write(&s->blk, &vp, &len))
+            sha256_ni_block(s->core, s->blk.block);
+}
+
+FUNC_ISA static void sha256_ni_final(ssh_hash *hash, uint8_t *digest)
+{
+    sha256_ni *s = container_of(hash, sha256_ni, hash);
+
+    sha256_block_pad(&s->blk, BinarySink_UPCAST(s));
+
+    /* Rearrange the words into the output order */
+    __m128i feba = _mm_shuffle_epi32(s->core[0], 0x1B);
+    __m128i dchg = _mm_shuffle_epi32(s->core[1], 0xB1);
+    __m128i dcba = _mm_blend_epi16(feba, dchg, 0xF0);
+    __m128i hgfe = _mm_alignr_epi8(dchg, feba, 8);
+
+    /* Byte-swap them into the output endianness */
+    const __m128i mask = _mm_setr_epi8(3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12);
+    dcba = _mm_shuffle_epi8(dcba, mask);
+    hgfe = _mm_shuffle_epi8(hgfe, mask);
+
+    /* And store them */
+    __m128i *output = (__m128i *)digest;
+    _mm_storeu_si128(output, dcba);
+    _mm_storeu_si128(output+1, hgfe);
+
+    sha256_ni_free(hash);
+}
+
+const ssh_hashalg ssh_sha256_hw = {
+    sha256_ni_new, sha256_ni_copy, sha256_ni_final, sha256_ni_free,
+    32, 64, "SHA-256",
+};
+
+/* ----------------------------------------------------------------------
+ * Stub functions if we have no hardware-accelerated SHA-256. In this
+ * case, sha256_hw_new returns NULL (though it should also never be
+ * selected by sha256_select, so the only thing that should even be
+ * _able_ to call it is testcrypt). As a result, the remaining vtable
+ * functions should never be called at all.
  */
-static void SHA256_ni(SHA256_State * s, const unsigned char *q, int len)
+
+#elif HW_SHA256 == HW_SHA256_NONE
+
+static bool sha256_hw_available(void)
 {
-    SHA256_ni_(s, q, len);
+    return false;
 }
 
-#else /* COMPILER_SUPPORTS_AES_NI */
-
-static void SHA256_ni(SHA256_State * s, const unsigned char *q, int len)
+static ssh_hash *sha256_stub_new(const ssh_hashalg *alg)
 {
-    unreachable("SHA256_ni not compiled in");
+    return NULL;
 }
 
-#endif /* COMPILER_SUPPORTS_AES_NI */
+#define STUB_BODY { unreachable("Should never be called"); }
+
+static ssh_hash *sha256_stub_copy(ssh_hash *hash) STUB_BODY
+static void sha256_stub_free(ssh_hash *hash) STUB_BODY
+static void sha256_stub_final(ssh_hash *hash, uint8_t *digest) STUB_BODY
+
+const ssh_hashalg ssh_sha256_hw = {
+    sha256_stub_new, sha256_stub_copy, sha256_stub_final, sha256_stub_free,
+    32, 64, "SHA-256",
+};
+
+#endif /* HW_SHA256 */
diff --git a/sshsha.c b/sshsha.c
index 82f1c913..c791760c 100644
--- a/sshsha.c
+++ b/sshsha.c
@@ -1,294 +1,291 @@
 /*
- * SHA1 hash algorithm. Used in SSH-2 as a MAC, and the transform is
- * also used as a `stirring' function for the PuTTY random number
- * pool. Implemented directly from the specification by Simon
- * Tatham.
+ * SHA-1 algorithm as described at
+ *
+ *   http://csrc.nist.gov/cryptval/shs.html
  */
 
 #include "ssh.h"
-
 #include <assert.h>
 
-/* ----------------------------------------------------------------------
- * Core SHA algorithm: processes 16-word blocks into a message digest.
+/*
+ * Start by deciding whether we can support hardware SHA at all.
  */
+#define HW_SHA1_NONE 0
+#define HW_SHA1_NI 1
 
-#define rol(x,y) ( ((x) << (y)) | (((uint32_t)x) >> (32-y)) )
-
-static void sha1_sw(SHA_State * s, const unsigned char *q, int len);
-static void sha1_ni(SHA_State * s, const unsigned char *q, int len);
-
-static void SHA_Core_Init(uint32_t h[5])
-{
-    h[0] = 0x67452301;
-    h[1] = 0xefcdab89;
-    h[2] = 0x98badcfe;
-    h[3] = 0x10325476;
-    h[4] = 0xc3d2e1f0;
-}
-
-void SHATransform(uint32_t * digest, uint32_t * block)
-{
-    uint32_t w[80];
-    uint32_t a, b, c, d, e;
-    int t;
-
-#ifdef RANDOM_DIAGNOSTICS
-    {
-        extern int random_diagnostics;
-        if (random_diagnostics) {
-            int i;
-            printf("SHATransform:");
-            for (i = 0; i < 5; i++)
-                printf(" %08x", digest[i]);
-            printf(" +");
-            for (i = 0; i < 16; i++)
-                printf(" %08x", block[i]);
-        }
-    }
+#ifdef _FORCE_SHA_NI
+# define HW_SHA1 HW_SHA1_NI
+#elif defined(__clang__)
+# if __has_attribute(target) && __has_include(<shaintrin.h>) && \
+    (defined(__x86_64__) || defined(__i386))
+# define HW_SHA1 HW_SHA1_NI
+# endif
+#elif defined(__GNUC__)
+# if (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4)) && \
+    (defined(__x86_64__) || defined(__i386))
+# define HW_SHA1 HW_SHA1_NI
+# endif
+#elif defined (_MSC_VER)
+# if (defined(_M_X64) || defined(_M_IX86)) && _MSC_FULL_VER >= 150030729
+# define HW_SHA1 HW_SHA1_NI
+# endif
 #endif
 
-    for (t = 0; t < 16; t++)
-        w[t] = block[t];
-
-    for (t = 16; t < 80; t++) {
-        uint32_t tmp = w[t - 3] ^ w[t - 8] ^ w[t - 14] ^ w[t - 16];
-        w[t] = rol(tmp, 1);
-    }
-
-    a = digest[0];
-    b = digest[1];
-    c = digest[2];
-    d = digest[3];
-    e = digest[4];
-
-    for (t = 0; t < 20; t++) {
-        uint32_t tmp =
-            rol(a, 5) + ((b & c) | (d & ~b)) + e + w[t] + 0x5a827999;
-        e = d;
-        d = c;
-        c = rol(b, 30);
-        b = a;
-        a = tmp;
-    }
-    for (t = 20; t < 40; t++) {
-        uint32_t tmp = rol(a, 5) + (b ^ c ^ d) + e + w[t] + 0x6ed9eba1;
-        e = d;
-        d = c;
-        c = rol(b, 30);
-        b = a;
-        a = tmp;
-    }
-    for (t = 40; t < 60; t++) {
-        uint32_t tmp = rol(a,
-                           5) + ((b & c) | (b & d) | (c & d)) + e + w[t] +
-            0x8f1bbcdc;
-        e = d;
-        d = c;
-        c = rol(b, 30);
-        b = a;
-        a = tmp;
-    }
-    for (t = 60; t < 80; t++) {
-        uint32_t tmp = rol(a, 5) + (b ^ c ^ d) + e + w[t] + 0xca62c1d6;
-        e = d;
-        d = c;
-        c = rol(b, 30);
-        b = a;
-        a = tmp;
-    }
-
-    digest[0] += a;
-    digest[1] += b;
-    digest[2] += c;
-    digest[3] += d;
-    digest[4] += e;
-
-#ifdef RANDOM_DIAGNOSTICS
-    {
-        extern int random_diagnostics;
-        if (random_diagnostics) {
-            int i;
-            printf(" =");
-            for (i = 0; i < 5; i++)
-                printf(" %08x", digest[i]);
-            printf("\n");
-        }
-    }
+#if defined _FORCE_SOFTWARE_SHA || !defined HW_SHA1
+# undef HW_SHA1
+# define HW_SHA1 HW_SHA1_NONE
 #endif
-}
-
-/* ----------------------------------------------------------------------
- * Outer SHA algorithm: take an arbitrary length byte string,
- * convert it into 16-word blocks with the prescribed padding at
- * the end, and pass those blocks to the core SHA algorithm.
- */
-
-static void SHA_BinarySink_write(BinarySink *bs, const void *p, size_t len);
-
-void SHA_Init(SHA_State * s)
-{
-    SHA_Core_Init(s->h);
-    s->blkused = 0;
-    s->len = 0;
-    if (supports_sha_ni())
-        s->sha1 = &sha1_ni;
-    else
-        s->sha1 = &sha1_sw;
-    BinarySink_INIT(s, SHA_BinarySink_write);
-}
-
-static void SHA_BinarySink_write(BinarySink *bs, const void *p, size_t len)
-{
-    struct SHA_State *s = BinarySink_DOWNCAST(bs, struct SHA_State);
-    const unsigned char *q = (const unsigned char *) p;
-
-    /*
-     * Update the length field.
- */ - s->len += len; - - (*(s->sha1))(s, q, len); -} - -static void sha1_sw(SHA_State * s, const unsigned char *q, int len) -{ - uint32_t wordblock[16]; - int i; - - if (s->blkused && s->blkused + len < 64) { - /* - * Trivial case: just add to the block. - */ - memcpy(s->block + s->blkused, q, len); - s->blkused += len; - } else { - /* - * We must complete and process at least one block. - */ - while (s->blkused + len >= 64) { - memcpy(s->block + s->blkused, q, 64 - s->blkused); - q += 64 - s->blkused; - len -= 64 - s->blkused; - /* Now process the block. Gather bytes big-endian into words */ - for (i = 0; i < 16; i++) { - wordblock[i] = - (((uint32_t) s->block[i * 4 + 0]) << 24) | - (((uint32_t) s->block[i * 4 + 1]) << 16) | - (((uint32_t) s->block[i * 4 + 2]) << 8) | - (((uint32_t) s->block[i * 4 + 3]) << 0); - } - SHATransform(s->h, wordblock); - s->blkused = 0; - } - memcpy(s->block, q, len); - s->blkused = len; - } -} - -void SHA_Final(SHA_State * s, unsigned char *output) -{ - int i; - int pad; - unsigned char c[64]; - uint64_t len; - - if (s->blkused >= 56) - pad = 56 + 64 - s->blkused; - else - pad = 56 - s->blkused; - - len = (s->len << 3); - - memset(c, 0, pad); - c[0] = 0x80; - put_data(s, &c, pad); - - put_uint64(s, len); - - for (i = 0; i < 5; i++) { - output[i * 4] = (s->h[i] >> 24) & 0xFF; - output[i * 4 + 1] = (s->h[i] >> 16) & 0xFF; - output[i * 4 + 2] = (s->h[i] >> 8) & 0xFF; - output[i * 4 + 3] = (s->h[i]) & 0xFF; - } -} - -void SHA_Simple(const void *p, int len, unsigned char *output) -{ - SHA_State s; - - SHA_Init(&s); - put_data(&s, p, len); - SHA_Final(&s, output); - smemclr(&s, sizeof(s)); -} /* - * Thin abstraction for things where hashes are pluggable. + * The actual query function that asks if hardware acceleration is + * available. */ +static bool sha1_hw_available(void); -struct sha1_hash { - SHA_State state; - ssh_hash hash; -}; - -static ssh_hash *sha1_new(const ssh_hashalg *alg) +/* + * The top-level selection function, caching the results of + * sha1_hw_available() so it only has to run once. + */ +static bool sha1_hw_available_cached(void) { - struct sha1_hash *h = snew(struct sha1_hash); - SHA_Init(&h->state); - h->hash.vt = alg; - BinarySink_DELEGATE_INIT(&h->hash, &h->state); - return &h->hash; + static bool initialised = false; + static bool hw_available; + if (!initialised) { + hw_available = sha1_hw_available(); + initialised = true; + } + return hw_available; } -static ssh_hash *sha1_copy(ssh_hash *hashold) +static ssh_hash *sha1_select(const ssh_hashalg *alg) { - struct sha1_hash *hold, *hnew; - ssh_hash *hashnew = sha1_new(hashold->vt); + const ssh_hashalg *real_alg = + sha1_hw_available_cached() ? 
&ssh_sha1_hw : &ssh_sha1_sw; - hold = container_of(hashold, struct sha1_hash, hash); - hnew = container_of(hashnew, struct sha1_hash, hash); - - hnew->state = hold->state; - BinarySink_COPIED(&hnew->state); - - return hashnew; -} - -static void sha1_free(ssh_hash *hash) -{ - struct sha1_hash *h = container_of(hash, struct sha1_hash, hash); - - smemclr(h, sizeof(*h)); - sfree(h); -} - -static void sha1_final(ssh_hash *hash, unsigned char *output) -{ - struct sha1_hash *h = container_of(hash, struct sha1_hash, hash); - SHA_Final(&h->state, output); - sha1_free(hash); + return ssh_hash_new(real_alg); } const ssh_hashalg ssh_sha1 = { - sha1_new, sha1_copy, sha1_final, sha1_free, 20, 64, "SHA-1" + sha1_select, NULL, NULL, NULL, + 20, 64, "SHA-1", }; -#ifdef COMPILER_SUPPORTS_SHA_NI +/* ---------------------------------------------------------------------- + * Definitions likely to be helpful to multiple implementations. + */ -#if defined _MSC_VER && defined _M_AMD64 -# include -#endif +static const uint32_t sha1_initial_state[] = { + 0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476, 0xc3d2e1f0, +}; + +#define SHA1_ROUNDS_PER_STAGE 20 +#define SHA1_STAGE0_CONSTANT 0x5a827999 +#define SHA1_STAGE1_CONSTANT 0x6ed9eba1 +#define SHA1_STAGE2_CONSTANT 0x8f1bbcdc +#define SHA1_STAGE3_CONSTANT 0xca62c1d6 +#define SHA1_ROUNDS (4 * SHA1_ROUNDS_PER_STAGE) + +typedef struct sha1_block sha1_block; +struct sha1_block { + uint8_t block[64]; + size_t used; + uint64_t len; +}; + +static inline void sha1_block_setup(sha1_block *blk) +{ + blk->used = 0; + blk->len = 0; +} + +static inline bool sha1_block_write( + sha1_block *blk, const void **vdata, size_t *len) +{ + size_t blkleft = sizeof(blk->block) - blk->used; + size_t chunk = *len < blkleft ? *len : blkleft; + + const uint8_t *p = *vdata; + memcpy(blk->block + blk->used, p, chunk); + *vdata = p + chunk; + *len -= chunk; + blk->used += chunk; + blk->len += chunk; + + if (blk->used == sizeof(blk->block)) { + blk->used = 0; + return true; + } + + return false; +} + +static inline void sha1_block_pad(sha1_block *blk, BinarySink *bs) +{ + uint64_t final_len = blk->len << 3; + size_t pad = 1 + (63 & (55 - blk->used)); + + put_byte(bs, 0x80); + for (size_t i = 1; i < pad; i++) + put_byte(bs, 0); + put_uint64(bs, final_len); + + assert(blk->used == 0 && "Should have exactly hit a block boundary"); +} + +/* ---------------------------------------------------------------------- + * Software implementation of SHA-1. 
+ */ + +static inline uint32_t rol(uint32_t x, unsigned y) +{ + return (x << (31 & y)) | (x >> (31 & -y)); +} + +static inline uint32_t Ch(uint32_t ctrl, uint32_t if1, uint32_t if0) +{ + return if0 ^ (ctrl & (if1 ^ if0)); +} + +static inline uint32_t Maj(uint32_t x, uint32_t y, uint32_t z) +{ + return (x & y) | (z & (x | y)); +} + +static inline uint32_t Par(uint32_t x, uint32_t y, uint32_t z) +{ + return (x ^ y ^ z); +} + +static inline void sha1_sw_round( + unsigned round_index, const uint32_t *schedule, + uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d, uint32_t *e, + uint32_t f, uint32_t constant) +{ + *e = rol(*a, 5) + f + *e + schedule[round_index] + constant; + *b = rol(*b, 30); +} + +static void sha1_sw_block(uint32_t *core, const uint8_t *block) +{ + uint32_t w[SHA1_ROUNDS]; + uint32_t a,b,c,d,e; + + for (size_t t = 0; t < 16; t++) + w[t] = GET_32BIT_MSB_FIRST(block + 4*t); + + for (size_t t = 16; t < SHA1_ROUNDS; t++) + w[t] = rol(w[t - 3] ^ w[t - 8] ^ w[t - 14] ^ w[t - 16], 1); + + a = core[0]; b = core[1]; c = core[2]; d = core[3]; + e = core[4]; + + size_t t = 0; + for (size_t u = 0; u < SHA1_ROUNDS_PER_STAGE/5; u++) { + sha1_sw_round(t++,w, &a,&b,&c,&d,&e, Ch(b,c,d), SHA1_STAGE0_CONSTANT); + sha1_sw_round(t++,w, &e,&a,&b,&c,&d, Ch(a,b,c), SHA1_STAGE0_CONSTANT); + sha1_sw_round(t++,w, &d,&e,&a,&b,&c, Ch(e,a,b), SHA1_STAGE0_CONSTANT); + sha1_sw_round(t++,w, &c,&d,&e,&a,&b, Ch(d,e,a), SHA1_STAGE0_CONSTANT); + sha1_sw_round(t++,w, &b,&c,&d,&e,&a, Ch(c,d,e), SHA1_STAGE0_CONSTANT); + } + for (size_t u = 0; u < SHA1_ROUNDS_PER_STAGE/5; u++) { + sha1_sw_round(t++,w, &a,&b,&c,&d,&e, Par(b,c,d), SHA1_STAGE1_CONSTANT); + sha1_sw_round(t++,w, &e,&a,&b,&c,&d, Par(a,b,c), SHA1_STAGE1_CONSTANT); + sha1_sw_round(t++,w, &d,&e,&a,&b,&c, Par(e,a,b), SHA1_STAGE1_CONSTANT); + sha1_sw_round(t++,w, &c,&d,&e,&a,&b, Par(d,e,a), SHA1_STAGE1_CONSTANT); + sha1_sw_round(t++,w, &b,&c,&d,&e,&a, Par(c,d,e), SHA1_STAGE1_CONSTANT); + } + for (size_t u = 0; u < SHA1_ROUNDS_PER_STAGE/5; u++) { + sha1_sw_round(t++,w, &a,&b,&c,&d,&e, Maj(b,c,d), SHA1_STAGE2_CONSTANT); + sha1_sw_round(t++,w, &e,&a,&b,&c,&d, Maj(a,b,c), SHA1_STAGE2_CONSTANT); + sha1_sw_round(t++,w, &d,&e,&a,&b,&c, Maj(e,a,b), SHA1_STAGE2_CONSTANT); + sha1_sw_round(t++,w, &c,&d,&e,&a,&b, Maj(d,e,a), SHA1_STAGE2_CONSTANT); + sha1_sw_round(t++,w, &b,&c,&d,&e,&a, Maj(c,d,e), SHA1_STAGE2_CONSTANT); + } + for (size_t u = 0; u < SHA1_ROUNDS_PER_STAGE/5; u++) { + sha1_sw_round(t++,w, &a,&b,&c,&d,&e, Par(b,c,d), SHA1_STAGE3_CONSTANT); + sha1_sw_round(t++,w, &e,&a,&b,&c,&d, Par(a,b,c), SHA1_STAGE3_CONSTANT); + sha1_sw_round(t++,w, &d,&e,&a,&b,&c, Par(e,a,b), SHA1_STAGE3_CONSTANT); + sha1_sw_round(t++,w, &c,&d,&e,&a,&b, Par(d,e,a), SHA1_STAGE3_CONSTANT); + sha1_sw_round(t++,w, &b,&c,&d,&e,&a, Par(c,d,e), SHA1_STAGE3_CONSTANT); + } + + core[0] += a; core[1] += b; core[2] += c; core[3] += d; core[4] += e; + + smemclr(w, sizeof(w)); +} + +typedef struct sha1_sw { + uint32_t core[5]; + sha1_block blk; + BinarySink_IMPLEMENTATION; + ssh_hash hash; +} sha1_sw; + +static void sha1_sw_write(BinarySink *bs, const void *vp, size_t len); + +static ssh_hash *sha1_sw_new(const ssh_hashalg *alg) +{ + sha1_sw *s = snew(sha1_sw); + + memcpy(s->core, sha1_initial_state, sizeof(s->core)); + + sha1_block_setup(&s->blk); + + s->hash.vt = alg; + BinarySink_INIT(s, sha1_sw_write); + BinarySink_DELEGATE_INIT(&s->hash, s); + return &s->hash; +} + +static ssh_hash *sha1_sw_copy(ssh_hash *hash) +{ + sha1_sw *s = container_of(hash, sha1_sw, hash); + sha1_sw *copy = snew(sha1_sw); + 
+    memcpy(copy, s, sizeof(*copy));
+    BinarySink_COPIED(copy);
+    BinarySink_DELEGATE_INIT(&copy->hash, copy);
+
+    return &copy->hash;
+}
+
+static void sha1_sw_free(ssh_hash *hash)
+{
+    sha1_sw *s = container_of(hash, sha1_sw, hash);
+
+    smemclr(s, sizeof(*s));
+    sfree(s);
+}
+
+static void sha1_sw_write(BinarySink *bs, const void *vp, size_t len)
+{
+    sha1_sw *s = BinarySink_DOWNCAST(bs, sha1_sw);
+
+    while (len > 0)
+        if (sha1_block_write(&s->blk, &vp, &len))
+            sha1_sw_block(s->core, s->blk.block);
+}
+
+static void sha1_sw_final(ssh_hash *hash, uint8_t *digest)
+{
+    sha1_sw *s = container_of(hash, sha1_sw, hash);
+
+    sha1_block_pad(&s->blk, BinarySink_UPCAST(s));
+    for (size_t i = 0; i < 5; i++)
+        PUT_32BIT_MSB_FIRST(digest + 4*i, s->core[i]);
+    sha1_sw_free(hash);
+}
+
+const ssh_hashalg ssh_sha1_sw = {
+    sha1_sw_new, sha1_sw_copy, sha1_sw_final, sha1_sw_free,
+    20, 64, "SHA-1",
+};
+
+/* ----------------------------------------------------------------------
+ * Hardware-accelerated implementation of SHA-1 using x86 SHA-NI.
+ */
+
+#if HW_SHA1 == HW_SHA1_NI
 
 /*
  * Set target architecture for Clang and GCC
@@ -298,7 +295,7 @@ const ssh_hashalg ssh_sha1 = {
 # pragma GCC target("sse4.1")
 #endif
 
-#if defined(__clang__) || (defined(__GNUC__) && (__GNUC__ >= 5))
+#if defined(__clang__) || (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)))
 # define FUNC_ISA __attribute__ ((target("sse4.1,sha")))
 #else
 # define FUNC_ISA
@@ -307,270 +304,353 @@ const ssh_hashalg ssh_sha1 = {
 #include <wmmintrin.h>
 #include <smmintrin.h>
 #include <immintrin.h>
-
 #if defined(__clang__) || defined(__GNUC__)
 #include <shaintrin.h>
 #endif
 
-/*
- * Determinators of CPU type
- */
 #if defined(__clang__) || defined(__GNUC__)
-
 #include <cpuid.h>
-bool supports_sha_ni(void)
+#define GET_CPU_ID_0(out) \
+    __cpuid(0, (out)[0], (out)[1], (out)[2], (out)[3])
+#define GET_CPU_ID_7(out) \
+    __cpuid_count(7, 0, (out)[0], (out)[1], (out)[2], (out)[3])
+#else
+#define GET_CPU_ID_0(out) __cpuid(out, 0)
+#define GET_CPU_ID_7(out) __cpuidex(out, 7, 0)
+#endif
+
+static bool sha1_hw_available(void)
 {
     unsigned int CPUInfo[4];
-    __cpuid(0, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]);
+    GET_CPU_ID_0(CPUInfo);
     if (CPUInfo[0] < 7)
         return false;
 
-    __cpuid_count(7, 0, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]);
-    return CPUInfo[1] & (1 << 29); /* SHA */
-}
-
-#else /* defined(__clang__) || defined(__GNUC__) */
-
-bool supports_sha_ni(void)
-{
-    unsigned int CPUInfo[4];
-    __cpuid(CPUInfo, 0);
-    if (CPUInfo[0] < 7)
-        return false;
-
-    __cpuidex(CPUInfo, 7, 0);
+    GET_CPU_ID_7(CPUInfo);
     return CPUInfo[1] & (1 << 29); /* Check SHA */
 }
 
-#endif /* defined(__clang__) || defined(__GNUC__) */
-
 /* SHA1 implementation using new instructions
    The code is based on Jeffrey Walton's SHA1 implementation:
    https://github.com/noloader/SHA-Intrinsics
 */
 FUNC_ISA
-static void sha1_ni_(SHA_State * s, const unsigned char *q, int len)
+static inline void sha1_ni_block(__m128i *core, const uint8_t *p)
 {
-    if (s->blkused && s->blkused + len < 64) {
-        /*
-         * Trivial case: just add to the block.
- */ - memcpy(s->block + s->blkused, q, len); - s->blkused += len; - } else { - __m128i ABCD, ABCD_SAVE, E0, E0_SAVE, E1; - const __m128i MASK = _mm_set_epi64x(0x0001020304050607ULL, 0x08090a0b0c0d0e0fULL); + __m128i ABCD, E0, E1, MSG0, MSG1, MSG2, MSG3; + const __m128i MASK = _mm_set_epi64x( + 0x0001020304050607ULL, 0x08090a0b0c0d0e0fULL); - ABCD = _mm_loadu_si128((const __m128i*) s->h); - E0 = _mm_set_epi32(s->h[4], 0, 0, 0); - ABCD = _mm_shuffle_epi32(ABCD, 0x1B); + const __m128i *block = (const __m128i *)p; - /* - * We must complete and process at least one block. - */ - while (s->blkused + len >= 64) - { - __m128i MSG0, MSG1, MSG2, MSG3; - memcpy(s->block + s->blkused, q, 64 - s->blkused); - q += 64 - s->blkused; - len -= 64 - s->blkused; + /* Load initial values */ + ABCD = core[0]; + E0 = core[1]; - /* Save current state */ - ABCD_SAVE = ABCD; - E0_SAVE = E0; + /* Rounds 0-3 */ + MSG0 = _mm_loadu_si128(block); + MSG0 = _mm_shuffle_epi8(MSG0, MASK); + E0 = _mm_add_epi32(E0, MSG0); + E1 = ABCD; + ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0); - /* Rounds 0-3 */ - MSG0 = _mm_loadu_si128((const __m128i*)(s->block + 0)); - MSG0 = _mm_shuffle_epi8(MSG0, MASK); - E0 = _mm_add_epi32(E0, MSG0); - E1 = ABCD; - ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0); + /* Rounds 4-7 */ + MSG1 = _mm_loadu_si128(block + 1); + MSG1 = _mm_shuffle_epi8(MSG1, MASK); + E1 = _mm_sha1nexte_epu32(E1, MSG1); + E0 = ABCD; + ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0); + MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1); - /* Rounds 4-7 */ - MSG1 = _mm_loadu_si128((const __m128i*)(s->block + 16)); - MSG1 = _mm_shuffle_epi8(MSG1, MASK); - E1 = _mm_sha1nexte_epu32(E1, MSG1); - E0 = ABCD; - ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0); - MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1); + /* Rounds 8-11 */ + MSG2 = _mm_loadu_si128(block + 2); + MSG2 = _mm_shuffle_epi8(MSG2, MASK); + E0 = _mm_sha1nexte_epu32(E0, MSG2); + E1 = ABCD; + ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0); + MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2); + MSG0 = _mm_xor_si128(MSG0, MSG2); - /* Rounds 8-11 */ - MSG2 = _mm_loadu_si128((const __m128i*)(s->block + 32)); - MSG2 = _mm_shuffle_epi8(MSG2, MASK); - E0 = _mm_sha1nexte_epu32(E0, MSG2); - E1 = ABCD; - ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0); - MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2); - MSG0 = _mm_xor_si128(MSG0, MSG2); + /* Rounds 12-15 */ + MSG3 = _mm_loadu_si128(block + 3); + MSG3 = _mm_shuffle_epi8(MSG3, MASK); + E1 = _mm_sha1nexte_epu32(E1, MSG3); + E0 = ABCD; + MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3); + ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0); + MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3); + MSG1 = _mm_xor_si128(MSG1, MSG3); - /* Rounds 12-15 */ - MSG3 = _mm_loadu_si128((const __m128i*)(s->block + 48)); - MSG3 = _mm_shuffle_epi8(MSG3, MASK); - E1 = _mm_sha1nexte_epu32(E1, MSG3); - E0 = ABCD; - MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3); - ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0); - MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3); - MSG1 = _mm_xor_si128(MSG1, MSG3); + /* Rounds 16-19 */ + E0 = _mm_sha1nexte_epu32(E0, MSG0); + E1 = ABCD; + MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0); + ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0); + MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0); + MSG2 = _mm_xor_si128(MSG2, MSG0); - /* Rounds 16-19 */ - E0 = _mm_sha1nexte_epu32(E0, MSG0); - E1 = ABCD; - MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0); - ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0); - MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0); - MSG2 = _mm_xor_si128(MSG2, MSG0); + /* Rounds 20-23 */ + E1 = _mm_sha1nexte_epu32(E1, MSG1); + E0 = ABCD; + MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1); + ABCD = _mm_sha1rnds4_epu32(ABCD, 
 
-            /* Rounds 20-23 */
-            E1 = _mm_sha1nexte_epu32(E1, MSG1);
-            E0 = ABCD;
-            MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
-            ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
-            MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
-            MSG3 = _mm_xor_si128(MSG3, MSG1);
+    /* Rounds 24-27 */
+    E0 = _mm_sha1nexte_epu32(E0, MSG2);
+    E1 = ABCD;
+    MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
+    ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1);
+    MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
+    MSG0 = _mm_xor_si128(MSG0, MSG2);
 
-            /* Rounds 24-27 */
-            E0 = _mm_sha1nexte_epu32(E0, MSG2);
-            E1 = ABCD;
-            MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
-            ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1);
-            MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
-            MSG0 = _mm_xor_si128(MSG0, MSG2);
+    /* Rounds 28-31 */
+    E1 = _mm_sha1nexte_epu32(E1, MSG3);
+    E0 = ABCD;
+    MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
+    ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
+    MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
+    MSG1 = _mm_xor_si128(MSG1, MSG3);
 
-            /* Rounds 28-31 */
-            E1 = _mm_sha1nexte_epu32(E1, MSG3);
-            E0 = ABCD;
-            MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
-            ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
-            MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
-            MSG1 = _mm_xor_si128(MSG1, MSG3);
+    /* Rounds 32-35 */
+    E0 = _mm_sha1nexte_epu32(E0, MSG0);
+    E1 = ABCD;
+    MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
+    ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1);
+    MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
+    MSG2 = _mm_xor_si128(MSG2, MSG0);
 
-            /* Rounds 32-35 */
-            E0 = _mm_sha1nexte_epu32(E0, MSG0);
-            E1 = ABCD;
-            MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
-            ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1);
-            MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
-            MSG2 = _mm_xor_si128(MSG2, MSG0);
+    /* Rounds 36-39 */
+    E1 = _mm_sha1nexte_epu32(E1, MSG1);
+    E0 = ABCD;
+    MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
+    ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
+    MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
+    MSG3 = _mm_xor_si128(MSG3, MSG1);
 
-            /* Rounds 36-39 */
-            E1 = _mm_sha1nexte_epu32(E1, MSG1);
-            E0 = ABCD;
-            MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
-            ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
-            MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
-            MSG3 = _mm_xor_si128(MSG3, MSG1);
+    /* Rounds 40-43 */
+    E0 = _mm_sha1nexte_epu32(E0, MSG2);
+    E1 = ABCD;
+    MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
+    ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
+    MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
+    MSG0 = _mm_xor_si128(MSG0, MSG2);
 
-            /* Rounds 40-43 */
-            E0 = _mm_sha1nexte_epu32(E0, MSG2);
-            E1 = ABCD;
-            MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
-            ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
-            MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
-            MSG0 = _mm_xor_si128(MSG0, MSG2);
+    /* Rounds 44-47 */
+    E1 = _mm_sha1nexte_epu32(E1, MSG3);
+    E0 = ABCD;
+    MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
+    ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2);
+    MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
+    MSG1 = _mm_xor_si128(MSG1, MSG3);
 
-            /* Rounds 44-47 */
-            E1 = _mm_sha1nexte_epu32(E1, MSG3);
-            E0 = ABCD;
-            MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
-            ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2);
-            MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
-            MSG1 = _mm_xor_si128(MSG1, MSG3);
+    /* Rounds 48-51 */
+    E0 = _mm_sha1nexte_epu32(E0, MSG0);
+    E1 = ABCD;
+    MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
+    ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
+    MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
+    MSG2 = _mm_xor_si128(MSG2, MSG0);
 
-            /* Rounds 48-51 */
-            E0 = _mm_sha1nexte_epu32(E0, MSG0);
-            E1 = ABCD;
-            MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
-            ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
-            MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
-            MSG2 = _mm_xor_si128(MSG2, MSG0);
+    /* Rounds 52-55 */
+    E1 = _mm_sha1nexte_epu32(E1, MSG1);
+    E0 = ABCD;
+    MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
+    ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2);
+    MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
+    MSG3 = _mm_xor_si128(MSG3, MSG1);
 
-            /* Rounds 52-55 */
-            E1 = _mm_sha1nexte_epu32(E1, MSG1);
-            E0 = ABCD;
-            MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
-            ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2);
-            MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
-            MSG3 = _mm_xor_si128(MSG3, MSG1);
+    /* Rounds 56-59 */
+    E0 = _mm_sha1nexte_epu32(E0, MSG2);
+    E1 = ABCD;
+    MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
+    ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
+    MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
+    MSG0 = _mm_xor_si128(MSG0, MSG2);
 
-            /* Rounds 56-59 */
-            E0 = _mm_sha1nexte_epu32(E0, MSG2);
-            E1 = ABCD;
-            MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
-            ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
-            MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
-            MSG0 = _mm_xor_si128(MSG0, MSG2);
+    /* Rounds 60-63 */
+    E1 = _mm_sha1nexte_epu32(E1, MSG3);
+    E0 = ABCD;
+    MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
+    ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
+    MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
+    MSG1 = _mm_xor_si128(MSG1, MSG3);
 
-            /* Rounds 60-63 */
-            E1 = _mm_sha1nexte_epu32(E1, MSG3);
-            E0 = ABCD;
-            MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
-            ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
-            MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
-            MSG1 = _mm_xor_si128(MSG1, MSG3);
+    /* Rounds 64-67 */
+    E0 = _mm_sha1nexte_epu32(E0, MSG0);
+    E1 = ABCD;
+    MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
+    ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3);
+    MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
+    MSG2 = _mm_xor_si128(MSG2, MSG0);
 
-            /* Rounds 64-67 */
-            E0 = _mm_sha1nexte_epu32(E0, MSG0);
-            E1 = ABCD;
-            MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
-            ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3);
-            MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
-            MSG2 = _mm_xor_si128(MSG2, MSG0);
+    /* Rounds 68-71 */
+    E1 = _mm_sha1nexte_epu32(E1, MSG1);
+    E0 = ABCD;
+    MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
+    ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
+    MSG3 = _mm_xor_si128(MSG3, MSG1);
 
-            /* Rounds 68-71 */
-            E1 = _mm_sha1nexte_epu32(E1, MSG1);
-            E0 = ABCD;
-            MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
-            ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
-            MSG3 = _mm_xor_si128(MSG3, MSG1);
+    /* Rounds 72-75 */
+    E0 = _mm_sha1nexte_epu32(E0, MSG2);
+    E1 = ABCD;
+    MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
+    ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3);
 
-            /* Rounds 72-75 */
-            E0 = _mm_sha1nexte_epu32(E0, MSG2);
-            E1 = ABCD;
-            MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
-            ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3);
+    /* Rounds 76-79 */
+    E1 = _mm_sha1nexte_epu32(E1, MSG3);
+    E0 = ABCD;
+    ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
 
-            /* Rounds 76-79 */
-            E1 = _mm_sha1nexte_epu32(E1, MSG3);
-            E0 = ABCD;
-            ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
-
-            /* Combine state */
-            E0 = _mm_sha1nexte_epu32(E0, E0_SAVE);
-            ABCD = _mm_add_epi32(ABCD, ABCD_SAVE);
-
-            s->blkused = 0;
-        }
-
-        ABCD = _mm_shuffle_epi32(ABCD, 0x1B);
-
-        /* Save state */
-        _mm_storeu_si128((__m128i*) s->h, ABCD);
-        s->h[4] = _mm_extract_epi32(E0, 3);
-
-        memcpy(s->block, q, len);
-        s->blkused = len;
-    }
+    /* Combine state */
+    core[0] = _mm_add_epi32(ABCD, core[0]);
+    core[1] = _mm_sha1nexte_epu32(E0, core[1]);
 }
 
-/*
- * Workaround LLVM bug https://bugs.llvm.org/show_bug.cgi?id=34980
+typedef struct sha1_ni {
+    /*
+     * core[0] stores the first four words of the SHA-1 state. core[1]
+     * stores just the fifth word, in the vector lane at the highest
+     * address.
+     */
+    __m128i core[2];
+    sha1_block blk;
+    void *pointer_to_free;
+    BinarySink_IMPLEMENTATION;
+    ssh_hash hash;
+} sha1_ni;
+
+static void sha1_ni_write(BinarySink *bs, const void *vp, size_t len);
+
+static sha1_ni *sha1_ni_alloc(void)
+{
+    /*
+     * The __m128i variables in the context structure need to be
+     * 16-byte aligned, but not all malloc implementations that this
+     * code has to work with will guarantee to return a 16-byte
+     * aligned pointer. So we over-allocate, manually realign the
+     * pointer ourselves, and store the original one inside the
+     * context so we know how to free it later.
+     */
+    void *allocation = smalloc(sizeof(sha1_ni) + 15);
+    uintptr_t alloc_address = (uintptr_t)allocation;
+    uintptr_t aligned_address = (alloc_address + 15) & ~15;
+    sha1_ni *s = (sha1_ni *)aligned_address;
+    s->pointer_to_free = allocation;
+    return s;
+}
+
+FUNC_ISA static ssh_hash *sha1_ni_new(const ssh_hashalg *alg)
+{
+    if (!sha1_hw_available_cached())
+        return NULL;
+
+    sha1_ni *s = sha1_ni_alloc();
+
+    /* Initialise the core vectors in their storage order */
+    s->core[0] = _mm_set_epi64x(
+        0x67452301efcdab89ULL, 0x98badcfe10325476ULL);
+    s->core[1] = _mm_set_epi32(0xc3d2e1f0, 0, 0, 0);
+
+    sha1_block_setup(&s->blk);
+
+    s->hash.vt = alg;
+    BinarySink_INIT(s, sha1_ni_write);
+    BinarySink_DELEGATE_INIT(&s->hash, s);
+    return &s->hash;
+}
+
+static ssh_hash *sha1_ni_copy(ssh_hash *hash)
+{
+    sha1_ni *s = container_of(hash, sha1_ni, hash);
+    sha1_ni *copy = sha1_ni_alloc();
+
+    void *ptf_save = copy->pointer_to_free;
+    *copy = *s; /* structure copy */
+    copy->pointer_to_free = ptf_save;
+
+    BinarySink_COPIED(copy);
+    BinarySink_DELEGATE_INIT(&copy->hash, copy);
+
+    return &copy->hash;
+}
+
+static void sha1_ni_free(ssh_hash *hash)
+{
+    sha1_ni *s = container_of(hash, sha1_ni, hash);
+
+    void *ptf = s->pointer_to_free;
+    smemclr(s, sizeof(*s));
+    sfree(ptf);
+}
+
+static void sha1_ni_write(BinarySink *bs, const void *vp, size_t len)
+{
+    sha1_ni *s = BinarySink_DOWNCAST(bs, sha1_ni);
+
+    while (len > 0)
+        if (sha1_block_write(&s->blk, &vp, &len))
+            sha1_ni_block(s->core, s->blk.block);
+}
+
+FUNC_ISA static void sha1_ni_final(ssh_hash *hash, uint8_t *digest)
+{
+    sha1_ni *s = container_of(hash, sha1_ni, hash);
+
+    sha1_block_pad(&s->blk, BinarySink_UPCAST(s));
+
+    /* Rearrange the first vector into its output order */
+    __m128i abcd = _mm_shuffle_epi32(s->core[0], 0x1B);
+
+    /* Byte-swap it into the output endianness */
+    const __m128i mask = _mm_setr_epi8(3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12);
+    abcd = _mm_shuffle_epi8(abcd, mask);
+
+    /* And store it */
+    _mm_storeu_si128((__m128i *)digest, abcd);
+
+    /* Finally, store the leftover word */
+    uint32_t e = _mm_extract_epi32(s->core[1], 3);
+    PUT_32BIT_MSB_FIRST(digest + 16, e);
+
+    sha1_ni_free(hash);
+}
+
+const ssh_hashalg ssh_sha1_hw = {
+    sha1_ni_new, sha1_ni_copy, sha1_ni_final, sha1_ni_free,
+    20, 64, "SHA-1",
+};
+
+/* ----------------------------------------------------------------------
+ * Stub functions if we have no hardware-accelerated SHA-1. In this
+ * case, sha1_hw_new returns NULL (though it should also never be
+ * selected by sha1_select, so the only thing that should even be
+ * _able_ to call it is testcrypt). As a result, the remaining vtable
+ * functions should never be called at all.
  */
-static void sha1_ni(SHA_State * s, const unsigned char *q, int len)
-{
-    sha1_ni_(s, q, len);
-}
-#else /* COMPILER_SUPPORTS_AES_NI */
+#elif HW_SHA1 == HW_SHA1_NONE
 
-static void sha1_ni(SHA_State * s, const unsigned char *q, int len)
-{
-    unreachable("sha1_ni not compiled in");
-}
-
-bool supports_sha_ni(void)
+static bool sha1_hw_available(void)
 {
     return false;
 }
 
-#endif /* COMPILER_SUPPORTS_AES_NI */
+static ssh_hash *sha1_stub_new(const ssh_hashalg *alg)
+{
+    return NULL;
+}
+
+#define STUB_BODY { unreachable("Should never be called"); }
+
+static ssh_hash *sha1_stub_copy(ssh_hash *hash) STUB_BODY
+static void sha1_stub_free(ssh_hash *hash) STUB_BODY
+static void sha1_stub_final(ssh_hash *hash, uint8_t *digest) STUB_BODY
+
+const ssh_hashalg ssh_sha1_hw = {
+    sha1_stub_new, sha1_stub_copy, sha1_stub_final, sha1_stub_free,
+    20, 64, "SHA-1",
+};
+
+#endif /* HW_SHA1 */
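The over-allocate-and-realign idiom in sha1_ni_alloc() above is independent of SHA-NI and worth seeing in isolation: allocate (alignment - 1) extra bytes, round the address up to the next 16-byte boundary, and stash the original pointer so it can still be freed. A minimal standalone sketch of the same technique (illustrative only, not part of the patch: plain malloc/free stand in for smalloc/sfree, and the demo_* names are invented):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct demo {
    void *pointer_to_free;     /* original, possibly unaligned pointer */
    unsigned char payload[64]; /* stand-in for the __m128i fields */
} demo;

static demo *demo_alloc(void)
{
    /* Over-allocate by 15 bytes so that a 16-byte boundary must fall
     * within the allocation, then round the address up to it. */
    void *allocation = malloc(sizeof(demo) + 15);
    uintptr_t aligned = ((uintptr_t)allocation + 15) & ~(uintptr_t)15;
    demo *d = (demo *)aligned;
    d->pointer_to_free = allocation;   /* remember what to free */
    return d;
}

static void demo_free(demo *d)
{
    free(d->pointer_to_free);   /* free the original pointer, not d */
}

int main(void)
{
    demo *d = demo_alloc();
    printf("16-byte aligned: %s\n", ((uintptr_t)d & 15) ? "no" : "yes");
    demo_free(d);
    return 0;
}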
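The layout described in the comment on struct sha1_ni can be checked directly: core[0] packs a,b,c,d with 'a' in the top 32-bit lane (the lane at the highest address once the vector is stored on a little-endian machine), and core[1] holds 'e' alone in its top lane. A small illustrative program using the same _mm_set_* initialisation as sha1_ni_new() (again not part of the patch; it needs SSE4.1 for _mm_extract_epi32, e.g. gcc -msse4.1):

#include <stdio.h>
#include <smmintrin.h>   /* SSE4.1, for _mm_extract_epi32 */

int main(void)
{
    /* The five SHA-1 initial values, packed as the patch stores them */
    __m128i core0 = _mm_set_epi64x(
        0x67452301efcdab89ULL, 0x98badcfe10325476ULL);
    __m128i core1 = _mm_set_epi32(0xc3d2e1f0, 0, 0, 0);

    printf("a = %08x (expect 67452301)\n",
           (unsigned)_mm_extract_epi32(core0, 3));
    printf("d = %08x (expect 10325476)\n",
           (unsigned)_mm_extract_epi32(core0, 0));
    printf("e = %08x (expect c3d2e1f0)\n",
           (unsigned)_mm_extract_epi32(core1, 3));
    return 0;
}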
diff --git a/test/cryptsuite.py b/test/cryptsuite.py
index 6216f4a7..9ff987a7 100755
--- a/test/cryptsuite.py
+++ b/test/cryptsuite.py
@@ -1172,6 +1172,46 @@ class crypt(MyTestBase):
         self.assertEqualBin(data2, expected_data2[:127])
         self.assertEqualBin(data3, expected_data3)
 
+    def testHashPadding(self):
+        # A consistency test for hashes that use MD5/SHA-1/SHA-2 style
+        # padding of the message into a whole number of fixed-size
+        # blocks. We test-hash a message of every length up to twice
+        # the block length, to make sure there's no off-by-1 error in
+        # the code that decides how much padding to put on.
+
+        # Source: generated using Python hashlib as an independent
+        # implementation. The function below will do it, called with
+        # parameters such as (hashlib.sha256,128).
+        #
+        # def gen_testcase(hashclass, maxlen):
+        #     return hashclass(b''.join(hashclass(text[:i]).digest()
+        #         for i in range(maxlen))).hexdigest()
+
+        text = """
+Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do
+eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad
+minim veniam, quis nostrud exercitation ullamco laboris nisi ut
+aliquip ex ea commodo consequat. Duis aute irure dolor in
+reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla
+pariatur. Excepteur sint occaecat cupidatat non proident, sunt in
+culpa qui officia deserunt mollit anim id est laborum.
+        """.replace('\n', ' ').strip()
+
+        def test(hashname, maxlen, expected):
+            assert len(text) >= maxlen
+            buf = b''.join(hash_str(hashname, text[:i])
+                           for i in range(maxlen))
+            self.assertEqualBin(hash_str(hashname, buf), unhex(expected))
+
+        test('md5', 128, '8169d766cc3b8df182b3ce756ae19a15')
+        test('sha1', 128, '3691759577deb3b70f427763a9c15acb9dfc0259')
+        test('sha256', 128, 'ec539c4d678412c86c13ee4eb9452232'
+                            '35d4eed3368d876fdf10c9df27396640')
+        test('sha512', 256,
+             'cb725b4b4ec0ac1174d69427b4d97848b7db4fc01181f99a8049a4d721862578'
+             'f91e026778bb2d389a9dd88153405189e6ba438b213c5387284103d2267fd055'
+             )
+
 class standard_test_vectors(MyTestBase):
     def testAES(self):
         def vector(cipher, key, plaintext, ciphertext):
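One detail the commented-out gen_testcase() sketch in testHashPadding glosses over is that hashlib hashes bytes rather than str. A self-contained re-derivation of the expected values, as an aid for reviewers (not part of the patch; it assumes the same lorem-ipsum text as the test, encoded as ASCII):

import hashlib

text = (
    "Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do "
    "eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad "
    "minim veniam, quis nostrud exercitation ullamco laboris nisi ut "
    "aliquip ex ea commodo consequat. Duis aute irure dolor in "
    "reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla "
    "pariatur. Excepteur sint occaecat cupidatat non proident, sunt in "
    "culpa qui officia deserunt mollit anim id est laborum."
).encode('ASCII')

def gen_testcase(hashclass, maxlen):
    # Hash every prefix of the text up to maxlen bytes, concatenate
    # the digests, and hash the concatenation once more.
    return hashclass(b''.join(hashclass(text[:i]).digest()
                              for i in range(maxlen))).hexdigest()

# Should reproduce the sha256 vector in the test above
print(gen_testcase(hashlib.sha256, 128))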