diff --git a/ssh.h b/ssh.h
index 3a2e0ece..1e732d78 100644
--- a/ssh.h
+++ b/ssh.h
@@ -558,8 +558,6 @@ struct ssh_cipher {
     const ssh_cipheralg *vt;
 };
 
-bool supports_sha_ni(void);
-
 struct ssh_cipheralg {
     ssh_cipher *(*new)(const ssh_cipheralg *alg);
     void (*free)(ssh_cipher *);
@@ -819,7 +817,11 @@ extern const ssh2_ciphers ssh2_arcfour;
 extern const ssh2_ciphers ssh2_ccp;
 extern const ssh_hashalg ssh_md5;
 extern const ssh_hashalg ssh_sha1;
+extern const ssh_hashalg ssh_sha1_hw;
+extern const ssh_hashalg ssh_sha1_sw;
 extern const ssh_hashalg ssh_sha256;
+extern const ssh_hashalg ssh_sha256_hw;
+extern const ssh_hashalg ssh_sha256_sw;
 extern const ssh_hashalg ssh_sha384;
 extern const ssh_hashalg ssh_sha512;
 extern const ssh_kexes ssh_diffiehellman_group1;
@@ -867,29 +869,6 @@ extern const char sshver[];
  */
 extern bool ssh_fallback_cmd(Backend *backend);
 
-/*
- * Check of compiler version
- */
-#ifdef _FORCE_SHA_NI
-# define COMPILER_SUPPORTS_SHA_NI
-#elif defined(__clang__)
-# if __has_attribute(target) && __has_include(<shaintrin.h>) && (defined(__x86_64__) || defined(__i386))
-# define COMPILER_SUPPORTS_SHA_NI
-# endif
-#elif defined(__GNUC__)
-# if ((__GNUC__ >= 5) && (defined(__x86_64__) || defined(__i386)))
-# define COMPILER_SUPPORTS_SHA_NI
-# endif
-#elif defined (_MSC_VER)
-# if (defined(_M_X64) || defined(_M_IX86)) && _MSC_VER >= 1900
-# define COMPILER_SUPPORTS_SHA_NI
-# endif
-#endif
-
-#ifdef _FORCE_SOFTWARE_SHA
-# undef COMPILER_SUPPORTS_SHA_NI
-#endif
-
 /*
  * The PRNG type, defined in sshprng.c. Visible data fields are
  * 'savesize', which suggests how many random bytes you should request
diff --git a/sshsh256.c b/sshsh256.c
index fbd8bdb2..11facbf8 100644
--- a/sshsh256.c
+++ b/sshsh256.c
@@ -7,259 +7,302 @@
 #include "ssh.h"
 #include <assert.h>
 
-/* ----------------------------------------------------------------------
- * Core SHA256 algorithm: processes 16-word blocks into a message digest.
+/*
+ * Start by deciding whether we can support hardware SHA at all.
  */
+#define HW_SHA256_NONE 0
+#define HW_SHA256_NI 1
 
-#define ror(x,y) ( ((x) << (32-y)) | (((uint32_t)(x)) >> (y)) )
-#define shr(x,y) ( (((uint32_t)(x)) >> (y)) )
-#define Ch(x,y,z) ( ((x) & (y)) ^ (~(x) & (z)) )
-#define Maj(x,y,z) ( ((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)) )
-#define bigsigma0(x) ( ror((x),2) ^ ror((x),13) ^ ror((x),22) )
-#define bigsigma1(x) ( ror((x),6) ^ ror((x),11) ^ ror((x),25) )
-#define smallsigma0(x) ( ror((x),7) ^ ror((x),18) ^ shr((x),3) )
-#define smallsigma1(x) ( ror((x),17) ^ ror((x),19) ^ shr((x),10) )
+#ifdef _FORCE_SHA_NI
+# define HW_SHA256 HW_SHA256_NI
+#elif defined(__clang__)
+# if __has_attribute(target) && __has_include(<shaintrin.h>) && \
+    (defined(__x86_64__) || defined(__i386))
+# define HW_SHA256 HW_SHA256_NI
+# endif
+#elif defined(__GNUC__)
+# if (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4)) && \
+    (defined(__x86_64__) || defined(__i386))
+# define HW_SHA256 HW_SHA256_NI
+# endif
+#elif defined (_MSC_VER)
+# if (defined(_M_X64) || defined(_M_IX86)) && _MSC_FULL_VER >= 150030729
+# define HW_SHA256 HW_SHA256_NI
+# endif
+#endif
 
-typedef struct SHA256_State {
-    uint32_t h[8];
-    unsigned char block[64];
-    int blkused;
-    uint64_t len;
-    void (*sha256)(struct SHA256_State * s, const unsigned char *p, int len);
-    BinarySink_IMPLEMENTATION;
-} SHA256_State;
-
-static void SHA256_sw(SHA256_State *s, const unsigned char *q, int len);
-static void SHA256_ni(SHA256_State *s, const unsigned char *q, int len);
-
-void SHA256_Core_Init(SHA256_State *s) {
-    s->h[0] = 0x6a09e667;
-    s->h[1] = 0xbb67ae85;
-    s->h[2] = 0x3c6ef372;
-    s->h[3] = 0xa54ff53a;
-    s->h[4] = 0x510e527f;
-    s->h[5] = 0x9b05688c;
-    s->h[6] = 0x1f83d9ab;
-    s->h[7] = 0x5be0cd19;
-}
-
-void SHA256_Block(SHA256_State *s, uint32_t *block) {
-    uint32_t w[80];
-    uint32_t a,b,c,d,e,f,g,h;
-    static const int k[] = {
-        0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
-        0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
-        0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
-        0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
-        0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
-        0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
-        0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
-        0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
-        0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
-        0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
-        0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
-        0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
-        0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
-        0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
-        0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
-        0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
-    };
-
-    int t;
-
-    for (t = 0; t < 16; t++)
-        w[t] = block[t];
-
-    for (t = 16; t < 64; t++)
-        w[t] = smallsigma1(w[t-2]) + w[t-7] + smallsigma0(w[t-15]) + w[t-16];
-
-    a = s->h[0]; b = s->h[1]; c = s->h[2]; d = s->h[3];
-    e = s->h[4]; f = s->h[5]; g = s->h[6]; h = s->h[7];
-
-    for (t = 0; t < 64; t+=8) {
-        uint32_t t1, t2;
-
-#define ROUND(j,a,b,c,d,e,f,g,h) \
-        t1 = h + bigsigma1(e) + Ch(e,f,g) + k[j] + w[j]; \
-        t2 = bigsigma0(a) + Maj(a,b,c); \
-        d = d + t1; h = t1 + t2;
-
-        ROUND(t+0, a,b,c,d,e,f,g,h);
-        ROUND(t+1, h,a,b,c,d,e,f,g);
-        ROUND(t+2, g,h,a,b,c,d,e,f);
-        ROUND(t+3, f,g,h,a,b,c,d,e);
-        ROUND(t+4, e,f,g,h,a,b,c,d);
-        ROUND(t+5, d,e,f,g,h,a,b,c);
-        ROUND(t+6, c,d,e,f,g,h,a,b);
-        ROUND(t+7, b,c,d,e,f,g,h,a);
-    }
-
-    s->h[0] += a; s->h[1] += b; s->h[2] += c; s->h[3] += d;
-    s->h[4] += e; s->h[5] += f; s->h[6] += g; s->h[7] += h;
-}
-
-/*
---------------------------------------------------------------------- - * Outer SHA256 algorithm: take an arbitrary length byte string, - * convert it into 16-word blocks with the prescribed padding at - * the end, and pass those blocks to the core SHA256 algorithm. - */ - -#define BLKSIZE 64 - -static void SHA256_BinarySink_write(BinarySink *bs, - const void *p, size_t len); - -void SHA256_Init(SHA256_State *s) { - SHA256_Core_Init(s); - s->blkused = 0; - s->len = 0; - if (supports_sha_ni()) - s->sha256 = &SHA256_ni; - else - s->sha256 = &SHA256_sw; - BinarySink_INIT(s, SHA256_BinarySink_write); -} - -static void SHA256_BinarySink_write(BinarySink *bs, - const void *p, size_t len) -{ - struct SHA256_State *s = BinarySink_DOWNCAST(bs, struct SHA256_State); - unsigned char *q = (unsigned char *)p; - - /* - * Update the length field. - */ - s->len += len; - - (*(s->sha256))(s, q, len); -} - -static void SHA256_sw(SHA256_State *s, const unsigned char *q, int len) { - uint32_t wordblock[16]; - int i; - - if (s->blkused && s->blkused+len < BLKSIZE) { - /* - * Trivial case: just add to the block. - */ - memcpy(s->block + s->blkused, q, len); - s->blkused += len; - } else { - /* - * We must complete and process at least one block. - */ - while (s->blkused + len >= BLKSIZE) { - memcpy(s->block + s->blkused, q, BLKSIZE - s->blkused); - q += BLKSIZE - s->blkused; - len -= BLKSIZE - s->blkused; - /* Now process the block. Gather bytes big-endian into words */ - for (i = 0; i < 16; i++) { - wordblock[i] = - ( ((uint32_t)s->block[i*4+0]) << 24 ) | - ( ((uint32_t)s->block[i*4+1]) << 16 ) | - ( ((uint32_t)s->block[i*4+2]) << 8 ) | - ( ((uint32_t)s->block[i*4+3]) << 0 ); - } - SHA256_Block(s, wordblock); - s->blkused = 0; - } - memcpy(s->block, q, len); - s->blkused = len; - } -} - -void SHA256_Final(SHA256_State *s, unsigned char *digest) { - int i; - int pad; - unsigned char c[64]; - uint64_t len; - - if (s->blkused >= 56) - pad = 56 + 64 - s->blkused; - else - pad = 56 - s->blkused; - - len = (s->len << 3); - - memset(c, 0, pad); - c[0] = 0x80; - put_data(s, &c, pad); - - put_uint64(s, len); - - for (i = 0; i < 8; i++) { - digest[i*4+0] = (s->h[i] >> 24) & 0xFF; - digest[i*4+1] = (s->h[i] >> 16) & 0xFF; - digest[i*4+2] = (s->h[i] >> 8) & 0xFF; - digest[i*4+3] = (s->h[i] >> 0) & 0xFF; - } -} - -void SHA256_Simple(const void *p, int len, unsigned char *output) { - SHA256_State s; - - SHA256_Init(&s); - put_data(&s, p, len); - SHA256_Final(&s, output); - smemclr(&s, sizeof(s)); -} +#if defined _FORCE_SOFTWARE_SHA || !defined HW_SHA256 +# undef HW_SHA256 +# define HW_SHA256 HW_SHA256_NONE +#endif /* - * Thin abstraction for things where hashes are pluggable. + * The actual query function that asks if hardware acceleration is + * available. */ +static bool sha256_hw_available(void); -struct sha256_hash { - SHA256_State state; - ssh_hash hash; -}; - -static ssh_hash *sha256_new(const ssh_hashalg *alg) +/* + * The top-level selection function, caching the results of + * sha256_hw_available() so it only has to run once. 
+ */ +static bool sha256_hw_available_cached(void) { - struct sha256_hash *h = snew(struct sha256_hash); - SHA256_Init(&h->state); - h->hash.vt = alg; - BinarySink_DELEGATE_INIT(&h->hash, &h->state); - return &h->hash; + static bool initialised = false; + static bool hw_available; + if (!initialised) { + hw_available = sha256_hw_available(); + initialised = true; + } + return hw_available; } -static ssh_hash *sha256_copy(ssh_hash *hashold) +static ssh_hash *sha256_select(const ssh_hashalg *alg) { - struct sha256_hash *hold, *hnew; - ssh_hash *hashnew = sha256_new(hashold->vt); + const ssh_hashalg *real_alg = + sha256_hw_available_cached() ? &ssh_sha256_hw : &ssh_sha256_sw; - hold = container_of(hashold, struct sha256_hash, hash); - hnew = container_of(hashnew, struct sha256_hash, hash); - - hnew->state = hold->state; - BinarySink_COPIED(&hnew->state); - - return hashnew; -} - -static void sha256_free(ssh_hash *hash) -{ - struct sha256_hash *h = container_of(hash, struct sha256_hash, hash); - - smemclr(h, sizeof(*h)); - sfree(h); -} - -static void sha256_final(ssh_hash *hash, unsigned char *output) -{ - struct sha256_hash *h = container_of(hash, struct sha256_hash, hash); - SHA256_Final(&h->state, output); - sha256_free(hash); + return ssh_hash_new(real_alg); } const ssh_hashalg ssh_sha256 = { - sha256_new, sha256_copy, sha256_final, sha256_free, 32, 64, "SHA-256" + sha256_select, NULL, NULL, NULL, + 32, 64, "SHA-256", }; -#ifdef COMPILER_SUPPORTS_SHA_NI +/* ---------------------------------------------------------------------- + * Definitions likely to be helpful to multiple implementations. + */ -#if defined _MSC_VER && defined _M_AMD64 -# include -#endif +static const uint32_t sha256_initial_state[] = { + 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, + 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19, +}; + +static const uint32_t sha256_round_constants[] = { + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, + 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, + 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, + 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, + 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, + 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, + 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, + 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, + 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, + 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, + 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, + 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, + 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, + 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, +}; + +#define SHA256_ROUNDS 64 + +typedef struct sha256_block sha256_block; +struct sha256_block { + uint8_t block[64]; + size_t used; + uint64_t len; +}; + +static inline void sha256_block_setup(sha256_block *blk) +{ + blk->used = 0; + blk->len = 0; +} + +static inline bool sha256_block_write( + sha256_block *blk, const void **vdata, size_t *len) +{ + size_t blkleft = sizeof(blk->block) - blk->used; + size_t chunk = *len < blkleft ? 
*len : blkleft; + + const uint8_t *p = *vdata; + memcpy(blk->block + blk->used, p, chunk); + *vdata = p + chunk; + *len -= chunk; + blk->used += chunk; + blk->len += chunk; + + if (blk->used == sizeof(blk->block)) { + blk->used = 0; + return true; + } + + return false; +} + +static inline void sha256_block_pad(sha256_block *blk, BinarySink *bs) +{ + uint64_t final_len = blk->len << 3; + size_t pad = 1 + (63 & (55 - blk->used)); + + put_byte(bs, 0x80); + for (size_t i = 1; i < pad; i++) + put_byte(bs, 0); + put_uint64(bs, final_len); + + assert(blk->used == 0 && "Should have exactly hit a block boundary"); +} + +/* ---------------------------------------------------------------------- + * Software implementation of SHA-256. + */ + +static inline uint32_t ror(uint32_t x, unsigned y) +{ + return (x << (31 & -y)) | (x >> (31 & y)); +} + +static inline uint32_t Ch(uint32_t ctrl, uint32_t if1, uint32_t if0) +{ + return if0 ^ (ctrl & (if1 ^ if0)); +} + +static inline uint32_t Maj(uint32_t x, uint32_t y, uint32_t z) +{ + return (x & y) | (z & (x | y)); +} + +static inline uint32_t Sigma_0(uint32_t x) +{ + return ror(x,2) ^ ror(x,13) ^ ror(x,22); +} + +static inline uint32_t Sigma_1(uint32_t x) +{ + return ror(x,6) ^ ror(x,11) ^ ror(x,25); +} + +static inline uint32_t sigma_0(uint32_t x) +{ + return ror(x,7) ^ ror(x,18) ^ (x >> 3); +} + +static inline uint32_t sigma_1(uint32_t x) +{ + return ror(x,17) ^ ror(x,19) ^ (x >> 10); +} + +static inline void sha256_sw_round( + unsigned round_index, const uint32_t *schedule, + uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d, + uint32_t *e, uint32_t *f, uint32_t *g, uint32_t *h) +{ + uint32_t t1 = *h + Sigma_1(*e) + Ch(*e,*f,*g) + + sha256_round_constants[round_index] + schedule[round_index]; + + uint32_t t2 = Sigma_0(*a) + Maj(*a,*b,*c); + + *d += t1; + *h = t1 + t2; +} + +static void sha256_sw_block(uint32_t *core, const uint8_t *block) +{ + uint32_t w[SHA256_ROUNDS]; + uint32_t a,b,c,d,e,f,g,h; + + for (size_t t = 0; t < 16; t++) + w[t] = GET_32BIT_MSB_FIRST(block + 4*t); + + for (size_t t = 16; t < SHA256_ROUNDS; t++) + w[t] = sigma_1(w[t-2]) + w[t-7] + sigma_0(w[t-15]) + w[t-16]; + + a = core[0]; b = core[1]; c = core[2]; d = core[3]; + e = core[4]; f = core[5]; g = core[6]; h = core[7]; + + for (size_t t = 0; t < SHA256_ROUNDS; t += 8) { + sha256_sw_round(t+0, w, &a,&b,&c,&d,&e,&f,&g,&h); + sha256_sw_round(t+1, w, &h,&a,&b,&c,&d,&e,&f,&g); + sha256_sw_round(t+2, w, &g,&h,&a,&b,&c,&d,&e,&f); + sha256_sw_round(t+3, w, &f,&g,&h,&a,&b,&c,&d,&e); + sha256_sw_round(t+4, w, &e,&f,&g,&h,&a,&b,&c,&d); + sha256_sw_round(t+5, w, &d,&e,&f,&g,&h,&a,&b,&c); + sha256_sw_round(t+6, w, &c,&d,&e,&f,&g,&h,&a,&b); + sha256_sw_round(t+7, w, &b,&c,&d,&e,&f,&g,&h,&a); + } + + core[0] += a; core[1] += b; core[2] += c; core[3] += d; + core[4] += e; core[5] += f; core[6] += g; core[7] += h; + + smemclr(w, sizeof(w)); +} + +typedef struct sha256_sw { + uint32_t core[8]; + sha256_block blk; + BinarySink_IMPLEMENTATION; + ssh_hash hash; +} sha256_sw; + +static void sha256_sw_write(BinarySink *bs, const void *vp, size_t len); + +static ssh_hash *sha256_sw_new(const ssh_hashalg *alg) +{ + sha256_sw *s = snew(sha256_sw); + + memcpy(s->core, sha256_initial_state, sizeof(s->core)); + + sha256_block_setup(&s->blk); + + s->hash.vt = alg; + BinarySink_INIT(s, sha256_sw_write); + BinarySink_DELEGATE_INIT(&s->hash, s); + return &s->hash; +} + +static ssh_hash *sha256_sw_copy(ssh_hash *hash) +{ + sha256_sw *s = container_of(hash, sha256_sw, hash); + sha256_sw *copy = snew(sha256_sw); + + 
+    memcpy(copy, s, sizeof(*copy));
+    BinarySink_COPIED(copy);
+    BinarySink_DELEGATE_INIT(&copy->hash, copy);
+
+    return &copy->hash;
+}
+
+static void sha256_sw_free(ssh_hash *hash)
+{
+    sha256_sw *s = container_of(hash, sha256_sw, hash);
+
+    smemclr(s, sizeof(*s));
+    sfree(s);
+}
+
+static void sha256_sw_write(BinarySink *bs, const void *vp, size_t len)
+{
+    sha256_sw *s = BinarySink_DOWNCAST(bs, sha256_sw);
+
+    while (len > 0)
+        if (sha256_block_write(&s->blk, &vp, &len))
+            sha256_sw_block(s->core, s->blk.block);
+}
+
+static void sha256_sw_final(ssh_hash *hash, uint8_t *digest)
+{
+    sha256_sw *s = container_of(hash, sha256_sw, hash);
+
+    sha256_block_pad(&s->blk, BinarySink_UPCAST(s));
+    for (size_t i = 0; i < 8; i++)
+        PUT_32BIT_MSB_FIRST(digest + 4*i, s->core[i]);
+    sha256_sw_free(hash);
+}
+
+const ssh_hashalg ssh_sha256_sw = {
+    sha256_sw_new, sha256_sw_copy, sha256_sw_final, sha256_sw_free,
+    32, 64, "SHA-256",
+};
+
+/* ----------------------------------------------------------------------
+ * Hardware-accelerated implementation of SHA-256 using x86 SHA-NI.
+ */
+
+#if HW_SHA256 == HW_SHA256_NI
 
 /*
  * Set target architecture for Clang and GCC
@@ -269,7 +312,7 @@ const ssh_hashalg ssh_sha256 = {
 # pragma GCC target("sse4.1")
 #endif
 
-#if defined(__clang__) || (defined(__GNUC__) && (__GNUC__ >= 5))
+#if defined(__clang__) || (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)))
 # define FUNC_ISA __attribute__ ((target("sse4.1,sha")))
 #else
 # define FUNC_ISA
@@ -278,236 +321,369 @@ const ssh_hashalg ssh_sha256 = {
 #include <wmmintrin.h>
 #include <smmintrin.h>
 #include <immintrin.h>
-
 #if defined(__clang__) || defined(__GNUC__)
 #include <shaintrin.h>
 #endif
 
+#if defined(__clang__) || defined(__GNUC__)
+#include <cpuid.h>
+#define GET_CPU_ID_0(out) \
+    __cpuid(0, (out)[0], (out)[1], (out)[2], (out)[3])
+#define GET_CPU_ID_7(out) \
+    __cpuid_count(7, 0, (out)[0], (out)[1], (out)[2], (out)[3])
+#else
+#define GET_CPU_ID_0(out) __cpuid(out, 0)
+#define GET_CPU_ID_7(out) __cpuidex(out, 7, 0)
+#endif
+
+static bool sha256_hw_available(void)
+{
+    unsigned int CPUInfo[4];
+    GET_CPU_ID_0(CPUInfo);
+    if (CPUInfo[0] < 7)
+        return false;
+
+    GET_CPU_ID_7(CPUInfo);
+    return CPUInfo[1] & (1 << 29); /* Check SHA */
+}
+
 /* SHA256 implementation using new instructions
    The code is based on Jeffrey Walton's SHA256 implementation:
    https://github.com/noloader/SHA-Intrinsics
 */
 FUNC_ISA
-static void SHA256_ni_(SHA256_State * s, const unsigned char *q, int len) {
-    if (s->blkused && s->blkused+len < BLKSIZE) {
-        /*
-         * Trivial case: just add to the block.
- */ - memcpy(s->block + s->blkused, q, len); - s->blkused += len; - } else { - __m128i STATE0, STATE1; - __m128i MSG, TMP; - __m128i MSG0, MSG1, MSG2, MSG3; - __m128i ABEF_SAVE, CDGH_SAVE; - const __m128i MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL); +static inline void sha256_ni_block(__m128i *core, const uint8_t *p) +{ + __m128i STATE0, STATE1; + __m128i MSG, TMP; + __m128i MSG0, MSG1, MSG2, MSG3; + const __m128i *block = (const __m128i *)p; + const __m128i MASK = _mm_set_epi64x( + 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL); - /* Load initial values */ - TMP = _mm_loadu_si128((const __m128i*) &s->h[0]); - STATE1 = _mm_loadu_si128((const __m128i*) &s->h[4]); + /* Load initial values */ + STATE0 = core[0]; + STATE1 = core[1]; - TMP = _mm_shuffle_epi32(TMP, 0xB1); /* CDAB */ - STATE1 = _mm_shuffle_epi32(STATE1, 0x1B); /* EFGH */ - STATE0 = _mm_alignr_epi8(TMP, STATE1, 8); /* ABEF */ - STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0); /* CDGH */ - /* - * We must complete and process at least one block. - */ - while (s->blkused + len >= BLKSIZE) { - memcpy(s->block + s->blkused, q, BLKSIZE - s->blkused); - q += BLKSIZE - s->blkused; - len -= BLKSIZE - s->blkused; + /* Rounds 0-3 */ + MSG = _mm_loadu_si128(block); + MSG0 = _mm_shuffle_epi8(MSG, MASK); + MSG = _mm_add_epi32(MSG0, _mm_set_epi64x( + 0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - /* Save current state */ - ABEF_SAVE = STATE0; - CDGH_SAVE = STATE1; + /* Rounds 4-7 */ + MSG1 = _mm_loadu_si128(block + 1); + MSG1 = _mm_shuffle_epi8(MSG1, MASK); + MSG = _mm_add_epi32(MSG1, _mm_set_epi64x( + 0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1); - /* Rounds 0-3 */ - MSG = _mm_loadu_si128((const __m128i*) (s->block + 0)); - MSG0 = _mm_shuffle_epi8(MSG, MASK); - MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + /* Rounds 8-11 */ + MSG2 = _mm_loadu_si128(block + 2); + MSG2 = _mm_shuffle_epi8(MSG2, MASK); + MSG = _mm_add_epi32(MSG2, _mm_set_epi64x( + 0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2); - /* Rounds 4-7 */ - MSG1 = _mm_loadu_si128((const __m128i*) (s->block + 16)); - MSG1 = _mm_shuffle_epi8(MSG1, MASK); - MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1); + /* Rounds 12-15 */ + MSG3 = _mm_loadu_si128(block + 3); + MSG3 = _mm_shuffle_epi8(MSG3, MASK); + MSG = _mm_add_epi32(MSG3, _mm_set_epi64x( + 0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG3, MSG2, 4); + MSG0 = _mm_add_epi32(MSG0, TMP); + MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = 
_mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3); - /* Rounds 8-11 */ - MSG2 = _mm_loadu_si128((const __m128i*) (s->block + 32)); - MSG2 = _mm_shuffle_epi8(MSG2, MASK); - MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2); + /* Rounds 16-19 */ + MSG = _mm_add_epi32(MSG0, _mm_set_epi64x( + 0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG0, MSG3, 4); + MSG1 = _mm_add_epi32(MSG1, TMP); + MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0); - /* Rounds 12-15 */ - MSG3 = _mm_loadu_si128((const __m128i*) (s->block + 48)); - MSG3 = _mm_shuffle_epi8(MSG3, MASK); - MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(MSG3, MSG2, 4); - MSG0 = _mm_add_epi32(MSG0, TMP); - MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3); + /* Rounds 20-23 */ + MSG = _mm_add_epi32(MSG1, _mm_set_epi64x( + 0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG1, MSG0, 4); + MSG2 = _mm_add_epi32(MSG2, TMP); + MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1); - /* Rounds 16-19 */ - MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(MSG0, MSG3, 4); - MSG1 = _mm_add_epi32(MSG1, TMP); - MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0); + /* Rounds 24-27 */ + MSG = _mm_add_epi32(MSG2, _mm_set_epi64x( + 0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG2, MSG1, 4); + MSG3 = _mm_add_epi32(MSG3, TMP); + MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2); - /* Rounds 20-23 */ - MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(MSG1, MSG0, 4); - MSG2 = _mm_add_epi32(MSG2, TMP); - MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1); + /* Rounds 28-31 */ + MSG = _mm_add_epi32(MSG3, _mm_set_epi64x( + 0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG3, MSG2, 4); + MSG0 = _mm_add_epi32(MSG0, TMP); + MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3); - /* 
Rounds 24-27 */ - MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(MSG2, MSG1, 4); - MSG3 = _mm_add_epi32(MSG3, TMP); - MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2); + /* Rounds 32-35 */ + MSG = _mm_add_epi32(MSG0, _mm_set_epi64x( + 0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG0, MSG3, 4); + MSG1 = _mm_add_epi32(MSG1, TMP); + MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0); - /* Rounds 28-31 */ - MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(MSG3, MSG2, 4); - MSG0 = _mm_add_epi32(MSG0, TMP); - MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3); + /* Rounds 36-39 */ + MSG = _mm_add_epi32(MSG1, _mm_set_epi64x( + 0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG1, MSG0, 4); + MSG2 = _mm_add_epi32(MSG2, TMP); + MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1); - /* Rounds 32-35 */ - MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(MSG0, MSG3, 4); - MSG1 = _mm_add_epi32(MSG1, TMP); - MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0); + /* Rounds 40-43 */ + MSG = _mm_add_epi32(MSG2, _mm_set_epi64x( + 0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG2, MSG1, 4); + MSG3 = _mm_add_epi32(MSG3, TMP); + MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2); - /* Rounds 36-39 */ - MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(MSG1, MSG0, 4); - MSG2 = _mm_add_epi32(MSG2, TMP); - MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1); + /* Rounds 44-47 */ + MSG = _mm_add_epi32(MSG3, _mm_set_epi64x( + 0x106AA070F40E3585ULL, 0xD6990624D192E819ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG3, MSG2, 4); + MSG0 = _mm_add_epi32(MSG0, TMP); + MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3); - /* Rounds 40-43 */ - MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = 
_mm_alignr_epi8(MSG2, MSG1, 4); - MSG3 = _mm_add_epi32(MSG3, TMP); - MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2); + /* Rounds 48-51 */ + MSG = _mm_add_epi32(MSG0, _mm_set_epi64x( + 0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG0, MSG3, 4); + MSG1 = _mm_add_epi32(MSG1, TMP); + MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0); - /* Rounds 44-47 */ - MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(MSG3, MSG2, 4); - MSG0 = _mm_add_epi32(MSG0, TMP); - MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3); + /* Rounds 52-55 */ + MSG = _mm_add_epi32(MSG1, _mm_set_epi64x( + 0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG1, MSG0, 4); + MSG2 = _mm_add_epi32(MSG2, TMP); + MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - /* Rounds 48-51 */ - MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(MSG0, MSG3, 4); - MSG1 = _mm_add_epi32(MSG1, TMP); - MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0); + /* Rounds 56-59 */ + MSG = _mm_add_epi32(MSG2, _mm_set_epi64x( + 0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG2, MSG1, 4); + MSG3 = _mm_add_epi32(MSG3, TMP); + MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - /* Rounds 52-55 */ - MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(MSG1, MSG0, 4); - MSG2 = _mm_add_epi32(MSG2, TMP); - MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + /* Rounds 60-63 */ + MSG = _mm_add_epi32(MSG3, _mm_set_epi64x( + 0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - /* Rounds 56-59 */ - MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(MSG2, MSG1, 4); - MSG3 = _mm_add_epi32(MSG3, TMP); - MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - - /* Rounds 60-63 */ - MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); 
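/*
 * Aside (annotation, not part of the patch): all sixteen four-round
 * groups above instantiate one template, so the hot path is easier to
 * read as the sketch below. K_HI/K_LO are hypothetical stand-ins for
 * the packed round constants; the intrinsics are the same ones this
 * patch already uses.
 *
 *    MSG = _mm_add_epi32(MSGn, _mm_set_epi64x(K_HI, K_LO));
 *    STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);  // rounds i, i+1
 *    MSG = _mm_shuffle_epi32(MSG, 0x0E);        // move W+K for i+2, i+3 down
 *    STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);  // rounds i+2, i+3
 *
 * plus, in the schedule-extending groups, one _mm_sha256msg1_epu32 and
 * one _mm_sha256msg2_epu32 call to derive the next four message words.
 */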
-
-        /* Combine state */
-        STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE);
-        STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE);
-
-        s->blkused = 0;
-        }
-
-        TMP = _mm_shuffle_epi32(STATE0, 0x1B);       /* FEBA */
-        STATE1 = _mm_shuffle_epi32(STATE1, 0xB1);    /* DCHG */
-        STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0); /* DCBA */
-        STATE1 = _mm_alignr_epi8(STATE1, TMP, 8);    /* ABEF */
-
-        /* Save state */
-        _mm_storeu_si128((__m128i*) &s->h[0], STATE0);
-        _mm_storeu_si128((__m128i*) &s->h[4], STATE1);
-
-        memcpy(s->block, q, len);
-        s->blkused = len;
-    }
+    /* Combine state */
+    core[0] = _mm_add_epi32(STATE0, core[0]);
+    core[1] = _mm_add_epi32(STATE1, core[1]);
 }
 
-/*
- * Workaround LLVM bug https://bugs.llvm.org/show_bug.cgi?id=34980
+typedef struct sha256_ni {
+    /*
+     * These two vectors store the 8 words of the SHA-256 state, but
+     * not in the same order they appear in the spec: the first word
+     * holds A,B,E,F and the second word C,D,G,H.
+     */
+    __m128i core[2];
+    sha256_block blk;
+    void *pointer_to_free;
+    BinarySink_IMPLEMENTATION;
+    ssh_hash hash;
+} sha256_ni;
+
+static void sha256_ni_write(BinarySink *bs, const void *vp, size_t len);
+
+static sha256_ni *sha256_ni_alloc(void)
+{
+    /*
+     * The __m128i variables in the context structure need to be
+     * 16-byte aligned, but not all malloc implementations that this
+     * code has to work with will guarantee to return a 16-byte
+     * aligned pointer. So we over-allocate, manually realign the
+     * pointer ourselves, and store the original one inside the
+     * context so we know how to free it later.
+     */
+    void *allocation = smalloc(sizeof(sha256_ni) + 15);
+    uintptr_t alloc_address = (uintptr_t)allocation;
+    uintptr_t aligned_address = (alloc_address + 15) & ~15;
+    sha256_ni *s = (sha256_ni *)aligned_address;
+    s->pointer_to_free = allocation;
+    return s;
+}
+
+FUNC_ISA static ssh_hash *sha256_ni_new(const ssh_hashalg *alg)
+{
+    if (!sha256_hw_available_cached())
+        return NULL;
+
+    sha256_ni *s = sha256_ni_alloc();
+
+    /* Initialise the core vectors in their storage order */
+    s->core[0] = _mm_set_epi64x(
+        0x6a09e667bb67ae85ULL, 0x510e527f9b05688cULL);
+    s->core[1] = _mm_set_epi64x(
+        0x3c6ef372a54ff53aULL, 0x1f83d9ab5be0cd19ULL);
+
+    sha256_block_setup(&s->blk);
+
+    s->hash.vt = alg;
+    BinarySink_INIT(s, sha256_ni_write);
+    BinarySink_DELEGATE_INIT(&s->hash, s);
+    return &s->hash;
+}
+
+static ssh_hash *sha256_ni_copy(ssh_hash *hash)
+{
+    sha256_ni *s = container_of(hash, sha256_ni, hash);
+    sha256_ni *copy = sha256_ni_alloc();
+
+    void *ptf_save = copy->pointer_to_free;
+    *copy = *s; /* structure copy */
+    copy->pointer_to_free = ptf_save;
+
+    BinarySink_COPIED(copy);
+    BinarySink_DELEGATE_INIT(&copy->hash, copy);
+
+    return &copy->hash;
+}
+
+static void sha256_ni_free(ssh_hash *hash)
+{
+    sha256_ni *s = container_of(hash, sha256_ni, hash);
+
+    void *ptf = s->pointer_to_free;
+    smemclr(s, sizeof(*s));
+    sfree(ptf);
+}
+
+static void sha256_ni_write(BinarySink *bs, const void *vp, size_t len)
+{
+    sha256_ni *s = BinarySink_DOWNCAST(bs, sha256_ni);
+
+    while (len > 0)
+        if (sha256_block_write(&s->blk, &vp, &len))
+            sha256_ni_block(s->core, s->blk.block);
+}
+
+FUNC_ISA static void sha256_ni_final(ssh_hash *hash, uint8_t *digest)
+{
+    sha256_ni *s = container_of(hash, sha256_ni, hash);
+
+    sha256_block_pad(&s->blk, BinarySink_UPCAST(s));
+
+    /* Rearrange the words into the output order */
+    __m128i feba = _mm_shuffle_epi32(s->core[0], 0x1B);
+    __m128i dchg = _mm_shuffle_epi32(s->core[1], 0xB1);
+    __m128i dcba = _mm_blend_epi16(feba, dchg, 0xF0);
+    __m128i hgfe = _mm_alignr_epi8(dchg, feba, 8);
+
+    /* Byte-swap them into the output endianness */
+    const __m128i mask = _mm_setr_epi8(3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12);
+    dcba = _mm_shuffle_epi8(dcba, mask);
+    hgfe = _mm_shuffle_epi8(hgfe, mask);
+
+    /* And store them */
+    __m128i *output = (__m128i *)digest;
+    _mm_storeu_si128(output, dcba);
+    _mm_storeu_si128(output+1, hgfe);
+
+    sha256_ni_free(hash);
+}
+
+const ssh_hashalg ssh_sha256_hw = {
+    sha256_ni_new, sha256_ni_copy, sha256_ni_final, sha256_ni_free,
+    32, 64, "SHA-256",
+};
+
+/* ----------------------------------------------------------------------
+ * Stub functions if we have no hardware-accelerated SHA-256. In this
+ * case, sha256_hw_new returns NULL (though it should also never be
+ * selected by sha256_select, so the only thing that should even be
+ * _able_ to call it is testcrypt). As a result, the remaining vtable
+ * functions should never be called at all.
  */
-static void SHA256_ni(SHA256_State * s, const unsigned char *q, int len)
+
+#elif HW_SHA256 == HW_SHA256_NONE
+
+static bool sha256_hw_available(void)
 {
-    SHA256_ni_(s, q, len);
+    return false;
 }
 
-#else /* COMPILER_SUPPORTS_AES_NI */
-
-static void SHA256_ni(SHA256_State * s, const unsigned char *q, int len)
+static ssh_hash *sha256_stub_new(const ssh_hashalg *alg)
 {
-    unreachable("SHA256_ni not compiled in");
+    return NULL;
 }
 
-#endif /* COMPILER_SUPPORTS_AES_NI */
+#define STUB_BODY { unreachable("Should never be called"); }
+
+static ssh_hash *sha256_stub_copy(ssh_hash *hash) STUB_BODY
+static void sha256_stub_free(ssh_hash *hash) STUB_BODY
+static void sha256_stub_final(ssh_hash *hash, uint8_t *digest) STUB_BODY
+
+const ssh_hashalg ssh_sha256_hw = {
+    sha256_stub_new, sha256_stub_copy, sha256_stub_final, sha256_stub_free,
+    32, 64, "SHA-256",
+};
+
+#endif /* HW_SHA256 */
diff --git a/sshsha.c b/sshsha.c
index 82f1c913..c791760c 100644
--- a/sshsha.c
+++ b/sshsha.c
@@ -1,294 +1,291 @@
 /*
- * SHA1 hash algorithm. Used in SSH-2 as a MAC, and the transform is
- * also used as a `stirring' function for the PuTTY random number
- * pool. Implemented directly from the specification by Simon
- * Tatham.
+ * SHA-1 algorithm as described at
+ *
+ *   http://csrc.nist.gov/cryptval/shs.html
  */
 
 #include "ssh.h"
-
 #include <assert.h>
 
-/* ----------------------------------------------------------------------
- * Core SHA algorithm: processes 16-word blocks into a message digest.
+/*
+ * Start by deciding whether we can support hardware SHA at all.
  */
+#define HW_SHA1_NONE 0
+#define HW_SHA1_NI 1
 
-#define rol(x,y) ( ((x) << (y)) | (((uint32_t)x) >> (32-y)) )
-
-static void sha1_sw(SHA_State * s, const unsigned char *q, int len);
-static void sha1_ni(SHA_State * s, const unsigned char *q, int len);
-
-static void SHA_Core_Init(uint32_t h[5])
-{
-    h[0] = 0x67452301;
-    h[1] = 0xefcdab89;
-    h[2] = 0x98badcfe;
-    h[3] = 0x10325476;
-    h[4] = 0xc3d2e1f0;
-}
-
-void SHATransform(uint32_t * digest, uint32_t * block)
-{
-    uint32_t w[80];
-    uint32_t a, b, c, d, e;
-    int t;
-
-#ifdef RANDOM_DIAGNOSTICS
-    {
-        extern int random_diagnostics;
-        if (random_diagnostics) {
-            int i;
-            printf("SHATransform:");
-            for (i = 0; i < 5; i++)
-                printf(" %08x", digest[i]);
-            printf(" +");
-            for (i = 0; i < 16; i++)
-                printf(" %08x", block[i]);
-        }
-    }
+#ifdef _FORCE_SHA_NI
+# define HW_SHA1 HW_SHA1_NI
+#elif defined(__clang__)
+# if __has_attribute(target) && __has_include(<shaintrin.h>) && \
+    (defined(__x86_64__) || defined(__i386))
+# define HW_SHA1 HW_SHA1_NI
+# endif
+#elif defined(__GNUC__)
+# if (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4)) && \
+    (defined(__x86_64__) || defined(__i386))
+# define HW_SHA1 HW_SHA1_NI
+# endif
+#elif defined (_MSC_VER)
+# if (defined(_M_X64) || defined(_M_IX86)) && _MSC_FULL_VER >= 150030729
+# define HW_SHA1 HW_SHA1_NI
+# endif
 #endif
 
-    for (t = 0; t < 16; t++)
-        w[t] = block[t];
-
-    for (t = 16; t < 80; t++) {
-        uint32_t tmp = w[t - 3] ^ w[t - 8] ^ w[t - 14] ^ w[t - 16];
-        w[t] = rol(tmp, 1);
-    }
-
-    a = digest[0];
-    b = digest[1];
-    c = digest[2];
-    d = digest[3];
-    e = digest[4];
-
-    for (t = 0; t < 20; t++) {
-        uint32_t tmp =
-            rol(a, 5) + ((b & c) | (d & ~b)) + e + w[t] + 0x5a827999;
-        e = d;
-        d = c;
-        c = rol(b, 30);
-        b = a;
-        a = tmp;
-    }
-    for (t = 20; t < 40; t++) {
-        uint32_t tmp = rol(a, 5) + (b ^ c ^ d) + e + w[t] + 0x6ed9eba1;
-        e = d;
-        d = c;
-        c = rol(b, 30);
-        b = a;
-        a = tmp;
-    }
-    for (t = 40; t < 60; t++) {
-        uint32_t tmp = rol(a,
-                           5) + ((b & c) | (b & d) | (c & d)) + e + w[t] +
-            0x8f1bbcdc;
-        e = d;
-        d = c;
-        c = rol(b, 30);
-        b = a;
-        a = tmp;
-    }
-    for (t = 60; t < 80; t++) {
-        uint32_t tmp = rol(a, 5) + (b ^ c ^ d) + e + w[t] + 0xca62c1d6;
-        e = d;
-        d = c;
-        c = rol(b, 30);
-        b = a;
-        a = tmp;
-    }
-
-    digest[0] += a;
-    digest[1] += b;
-    digest[2] += c;
-    digest[3] += d;
-    digest[4] += e;
-
-#ifdef RANDOM_DIAGNOSTICS
-    {
-        extern int random_diagnostics;
-        if (random_diagnostics) {
-            int i;
-            printf(" =");
-            for (i = 0; i < 5; i++)
-                printf(" %08x", digest[i]);
-            printf("\n");
-        }
-    }
+#if defined _FORCE_SOFTWARE_SHA || !defined HW_SHA1
+# undef HW_SHA1
+# define HW_SHA1 HW_SHA1_NONE
 #endif
-}
-
-/* ----------------------------------------------------------------------
- * Outer SHA algorithm: take an arbitrary length byte string,
- * convert it into 16-word blocks with the prescribed padding at
- * the end, and pass those blocks to the core SHA algorithm.
- */
-
-static void SHA_BinarySink_write(BinarySink *bs, const void *p, size_t len);
-
-void SHA_Init(SHA_State * s)
-{
-    SHA_Core_Init(s->h);
-    s->blkused = 0;
-    s->len = 0;
-    if (supports_sha_ni())
-        s->sha1 = &sha1_ni;
-    else
-        s->sha1 = &sha1_sw;
-    BinarySink_INIT(s, SHA_BinarySink_write);
-}
-
-static void SHA_BinarySink_write(BinarySink *bs, const void *p, size_t len)
-{
-    struct SHA_State *s = BinarySink_DOWNCAST(bs, struct SHA_State);
-    const unsigned char *q = (const unsigned char *) p;
-
-    /*
-     * Update the length field.
- */ - s->len += len; - - (*(s->sha1))(s, q, len); -} - -static void sha1_sw(SHA_State * s, const unsigned char *q, int len) -{ - uint32_t wordblock[16]; - int i; - - if (s->blkused && s->blkused + len < 64) { - /* - * Trivial case: just add to the block. - */ - memcpy(s->block + s->blkused, q, len); - s->blkused += len; - } else { - /* - * We must complete and process at least one block. - */ - while (s->blkused + len >= 64) { - memcpy(s->block + s->blkused, q, 64 - s->blkused); - q += 64 - s->blkused; - len -= 64 - s->blkused; - /* Now process the block. Gather bytes big-endian into words */ - for (i = 0; i < 16; i++) { - wordblock[i] = - (((uint32_t) s->block[i * 4 + 0]) << 24) | - (((uint32_t) s->block[i * 4 + 1]) << 16) | - (((uint32_t) s->block[i * 4 + 2]) << 8) | - (((uint32_t) s->block[i * 4 + 3]) << 0); - } - SHATransform(s->h, wordblock); - s->blkused = 0; - } - memcpy(s->block, q, len); - s->blkused = len; - } -} - -void SHA_Final(SHA_State * s, unsigned char *output) -{ - int i; - int pad; - unsigned char c[64]; - uint64_t len; - - if (s->blkused >= 56) - pad = 56 + 64 - s->blkused; - else - pad = 56 - s->blkused; - - len = (s->len << 3); - - memset(c, 0, pad); - c[0] = 0x80; - put_data(s, &c, pad); - - put_uint64(s, len); - - for (i = 0; i < 5; i++) { - output[i * 4] = (s->h[i] >> 24) & 0xFF; - output[i * 4 + 1] = (s->h[i] >> 16) & 0xFF; - output[i * 4 + 2] = (s->h[i] >> 8) & 0xFF; - output[i * 4 + 3] = (s->h[i]) & 0xFF; - } -} - -void SHA_Simple(const void *p, int len, unsigned char *output) -{ - SHA_State s; - - SHA_Init(&s); - put_data(&s, p, len); - SHA_Final(&s, output); - smemclr(&s, sizeof(s)); -} /* - * Thin abstraction for things where hashes are pluggable. + * The actual query function that asks if hardware acceleration is + * available. */ +static bool sha1_hw_available(void); -struct sha1_hash { - SHA_State state; - ssh_hash hash; -}; - -static ssh_hash *sha1_new(const ssh_hashalg *alg) +/* + * The top-level selection function, caching the results of + * sha1_hw_available() so it only has to run once. + */ +static bool sha1_hw_available_cached(void) { - struct sha1_hash *h = snew(struct sha1_hash); - SHA_Init(&h->state); - h->hash.vt = alg; - BinarySink_DELEGATE_INIT(&h->hash, &h->state); - return &h->hash; + static bool initialised = false; + static bool hw_available; + if (!initialised) { + hw_available = sha1_hw_available(); + initialised = true; + } + return hw_available; } -static ssh_hash *sha1_copy(ssh_hash *hashold) +static ssh_hash *sha1_select(const ssh_hashalg *alg) { - struct sha1_hash *hold, *hnew; - ssh_hash *hashnew = sha1_new(hashold->vt); + const ssh_hashalg *real_alg = + sha1_hw_available_cached() ? 
&ssh_sha1_hw : &ssh_sha1_sw; - hold = container_of(hashold, struct sha1_hash, hash); - hnew = container_of(hashnew, struct sha1_hash, hash); - - hnew->state = hold->state; - BinarySink_COPIED(&hnew->state); - - return hashnew; -} - -static void sha1_free(ssh_hash *hash) -{ - struct sha1_hash *h = container_of(hash, struct sha1_hash, hash); - - smemclr(h, sizeof(*h)); - sfree(h); -} - -static void sha1_final(ssh_hash *hash, unsigned char *output) -{ - struct sha1_hash *h = container_of(hash, struct sha1_hash, hash); - SHA_Final(&h->state, output); - sha1_free(hash); + return ssh_hash_new(real_alg); } const ssh_hashalg ssh_sha1 = { - sha1_new, sha1_copy, sha1_final, sha1_free, 20, 64, "SHA-1" + sha1_select, NULL, NULL, NULL, + 20, 64, "SHA-1", }; -#ifdef COMPILER_SUPPORTS_SHA_NI +/* ---------------------------------------------------------------------- + * Definitions likely to be helpful to multiple implementations. + */ -#if defined _MSC_VER && defined _M_AMD64 -# include -#endif +static const uint32_t sha1_initial_state[] = { + 0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476, 0xc3d2e1f0, +}; + +#define SHA1_ROUNDS_PER_STAGE 20 +#define SHA1_STAGE0_CONSTANT 0x5a827999 +#define SHA1_STAGE1_CONSTANT 0x6ed9eba1 +#define SHA1_STAGE2_CONSTANT 0x8f1bbcdc +#define SHA1_STAGE3_CONSTANT 0xca62c1d6 +#define SHA1_ROUNDS (4 * SHA1_ROUNDS_PER_STAGE) + +typedef struct sha1_block sha1_block; +struct sha1_block { + uint8_t block[64]; + size_t used; + uint64_t len; +}; + +static inline void sha1_block_setup(sha1_block *blk) +{ + blk->used = 0; + blk->len = 0; +} + +static inline bool sha1_block_write( + sha1_block *blk, const void **vdata, size_t *len) +{ + size_t blkleft = sizeof(blk->block) - blk->used; + size_t chunk = *len < blkleft ? *len : blkleft; + + const uint8_t *p = *vdata; + memcpy(blk->block + blk->used, p, chunk); + *vdata = p + chunk; + *len -= chunk; + blk->used += chunk; + blk->len += chunk; + + if (blk->used == sizeof(blk->block)) { + blk->used = 0; + return true; + } + + return false; +} + +static inline void sha1_block_pad(sha1_block *blk, BinarySink *bs) +{ + uint64_t final_len = blk->len << 3; + size_t pad = 1 + (63 & (55 - blk->used)); + + put_byte(bs, 0x80); + for (size_t i = 1; i < pad; i++) + put_byte(bs, 0); + put_uint64(bs, final_len); + + assert(blk->used == 0 && "Should have exactly hit a block boundary"); +} + +/* ---------------------------------------------------------------------- + * Software implementation of SHA-1. 
+ */ + +static inline uint32_t rol(uint32_t x, unsigned y) +{ + return (x << (31 & y)) | (x >> (31 & -y)); +} + +static inline uint32_t Ch(uint32_t ctrl, uint32_t if1, uint32_t if0) +{ + return if0 ^ (ctrl & (if1 ^ if0)); +} + +static inline uint32_t Maj(uint32_t x, uint32_t y, uint32_t z) +{ + return (x & y) | (z & (x | y)); +} + +static inline uint32_t Par(uint32_t x, uint32_t y, uint32_t z) +{ + return (x ^ y ^ z); +} + +static inline void sha1_sw_round( + unsigned round_index, const uint32_t *schedule, + uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d, uint32_t *e, + uint32_t f, uint32_t constant) +{ + *e = rol(*a, 5) + f + *e + schedule[round_index] + constant; + *b = rol(*b, 30); +} + +static void sha1_sw_block(uint32_t *core, const uint8_t *block) +{ + uint32_t w[SHA1_ROUNDS]; + uint32_t a,b,c,d,e; + + for (size_t t = 0; t < 16; t++) + w[t] = GET_32BIT_MSB_FIRST(block + 4*t); + + for (size_t t = 16; t < SHA1_ROUNDS; t++) + w[t] = rol(w[t - 3] ^ w[t - 8] ^ w[t - 14] ^ w[t - 16], 1); + + a = core[0]; b = core[1]; c = core[2]; d = core[3]; + e = core[4]; + + size_t t = 0; + for (size_t u = 0; u < SHA1_ROUNDS_PER_STAGE/5; u++) { + sha1_sw_round(t++,w, &a,&b,&c,&d,&e, Ch(b,c,d), SHA1_STAGE0_CONSTANT); + sha1_sw_round(t++,w, &e,&a,&b,&c,&d, Ch(a,b,c), SHA1_STAGE0_CONSTANT); + sha1_sw_round(t++,w, &d,&e,&a,&b,&c, Ch(e,a,b), SHA1_STAGE0_CONSTANT); + sha1_sw_round(t++,w, &c,&d,&e,&a,&b, Ch(d,e,a), SHA1_STAGE0_CONSTANT); + sha1_sw_round(t++,w, &b,&c,&d,&e,&a, Ch(c,d,e), SHA1_STAGE0_CONSTANT); + } + for (size_t u = 0; u < SHA1_ROUNDS_PER_STAGE/5; u++) { + sha1_sw_round(t++,w, &a,&b,&c,&d,&e, Par(b,c,d), SHA1_STAGE1_CONSTANT); + sha1_sw_round(t++,w, &e,&a,&b,&c,&d, Par(a,b,c), SHA1_STAGE1_CONSTANT); + sha1_sw_round(t++,w, &d,&e,&a,&b,&c, Par(e,a,b), SHA1_STAGE1_CONSTANT); + sha1_sw_round(t++,w, &c,&d,&e,&a,&b, Par(d,e,a), SHA1_STAGE1_CONSTANT); + sha1_sw_round(t++,w, &b,&c,&d,&e,&a, Par(c,d,e), SHA1_STAGE1_CONSTANT); + } + for (size_t u = 0; u < SHA1_ROUNDS_PER_STAGE/5; u++) { + sha1_sw_round(t++,w, &a,&b,&c,&d,&e, Maj(b,c,d), SHA1_STAGE2_CONSTANT); + sha1_sw_round(t++,w, &e,&a,&b,&c,&d, Maj(a,b,c), SHA1_STAGE2_CONSTANT); + sha1_sw_round(t++,w, &d,&e,&a,&b,&c, Maj(e,a,b), SHA1_STAGE2_CONSTANT); + sha1_sw_round(t++,w, &c,&d,&e,&a,&b, Maj(d,e,a), SHA1_STAGE2_CONSTANT); + sha1_sw_round(t++,w, &b,&c,&d,&e,&a, Maj(c,d,e), SHA1_STAGE2_CONSTANT); + } + for (size_t u = 0; u < SHA1_ROUNDS_PER_STAGE/5; u++) { + sha1_sw_round(t++,w, &a,&b,&c,&d,&e, Par(b,c,d), SHA1_STAGE3_CONSTANT); + sha1_sw_round(t++,w, &e,&a,&b,&c,&d, Par(a,b,c), SHA1_STAGE3_CONSTANT); + sha1_sw_round(t++,w, &d,&e,&a,&b,&c, Par(e,a,b), SHA1_STAGE3_CONSTANT); + sha1_sw_round(t++,w, &c,&d,&e,&a,&b, Par(d,e,a), SHA1_STAGE3_CONSTANT); + sha1_sw_round(t++,w, &b,&c,&d,&e,&a, Par(c,d,e), SHA1_STAGE3_CONSTANT); + } + + core[0] += a; core[1] += b; core[2] += c; core[3] += d; core[4] += e; + + smemclr(w, sizeof(w)); +} + +typedef struct sha1_sw { + uint32_t core[5]; + sha1_block blk; + BinarySink_IMPLEMENTATION; + ssh_hash hash; +} sha1_sw; + +static void sha1_sw_write(BinarySink *bs, const void *vp, size_t len); + +static ssh_hash *sha1_sw_new(const ssh_hashalg *alg) +{ + sha1_sw *s = snew(sha1_sw); + + memcpy(s->core, sha1_initial_state, sizeof(s->core)); + + sha1_block_setup(&s->blk); + + s->hash.vt = alg; + BinarySink_INIT(s, sha1_sw_write); + BinarySink_DELEGATE_INIT(&s->hash, s); + return &s->hash; +} + +static ssh_hash *sha1_sw_copy(ssh_hash *hash) +{ + sha1_sw *s = container_of(hash, sha1_sw, hash); + sha1_sw *copy = snew(sha1_sw); + 
+    memcpy(copy, s, sizeof(*copy));
+    BinarySink_COPIED(copy);
+    BinarySink_DELEGATE_INIT(&copy->hash, copy);
+
+    return &copy->hash;
+}
+
+static void sha1_sw_free(ssh_hash *hash)
+{
+    sha1_sw *s = container_of(hash, sha1_sw, hash);
+
+    smemclr(s, sizeof(*s));
+    sfree(s);
+}
+
+static void sha1_sw_write(BinarySink *bs, const void *vp, size_t len)
+{
+    sha1_sw *s = BinarySink_DOWNCAST(bs, sha1_sw);
+
+    while (len > 0)
+        if (sha1_block_write(&s->blk, &vp, &len))
+            sha1_sw_block(s->core, s->blk.block);
+}
+
+static void sha1_sw_final(ssh_hash *hash, uint8_t *digest)
+{
+    sha1_sw *s = container_of(hash, sha1_sw, hash);
+
+    sha1_block_pad(&s->blk, BinarySink_UPCAST(s));
+    for (size_t i = 0; i < 5; i++)
+        PUT_32BIT_MSB_FIRST(digest + 4*i, s->core[i]);
+    sha1_sw_free(hash);
+}
+
+const ssh_hashalg ssh_sha1_sw = {
+    sha1_sw_new, sha1_sw_copy, sha1_sw_final, sha1_sw_free,
+    20, 64, "SHA-1",
+};
+
+/* ----------------------------------------------------------------------
+ * Hardware-accelerated implementation of SHA-1 using x86 SHA-NI.
+ */
+
+#if HW_SHA1 == HW_SHA1_NI
 
 /*
  * Set target architecture for Clang and GCC
@@ -298,7 +295,7 @@ const ssh_hashalg ssh_sha1 = {
 # pragma GCC target("sse4.1")
 #endif
 
-#if defined(__clang__) || (defined(__GNUC__) && (__GNUC__ >= 5))
+#if defined(__clang__) || (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)))
 # define FUNC_ISA __attribute__ ((target("sse4.1,sha")))
 #else
 # define FUNC_ISA
@@ -307,270 +304,353 @@ const ssh_hashalg ssh_sha1 = {
 #include <wmmintrin.h>
 #include <smmintrin.h>
 #include <immintrin.h>
-
 #if defined(__clang__) || defined(__GNUC__)
 #include <shaintrin.h>
 #endif
 
-/*
- * Determinators of CPU type
- */
 #if defined(__clang__) || defined(__GNUC__)
-
 #include <cpuid.h>
-bool supports_sha_ni(void)
+#define GET_CPU_ID_0(out) \
+    __cpuid(0, (out)[0], (out)[1], (out)[2], (out)[3])
+#define GET_CPU_ID_7(out) \
+    __cpuid_count(7, 0, (out)[0], (out)[1], (out)[2], (out)[3])
+#else
+#define GET_CPU_ID_0(out) __cpuid(out, 0)
+#define GET_CPU_ID_7(out) __cpuidex(out, 7, 0)
+#endif
+
+static bool sha1_hw_available(void)
 {
     unsigned int CPUInfo[4];
-    __cpuid(0, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]);
+    GET_CPU_ID_0(CPUInfo);
     if (CPUInfo[0] < 7)
         return false;
 
-    __cpuid_count(7, 0, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]);
-    return CPUInfo[1] & (1 << 29); /* SHA */
-}
-
-#else /* defined(__clang__) || defined(__GNUC__) */
-
-bool supports_sha_ni(void)
-{
-    unsigned int CPUInfo[4];
-    __cpuid(CPUInfo, 0);
-    if (CPUInfo[0] < 7)
-        return false;
-
-    __cpuidex(CPUInfo, 7, 0);
+    GET_CPU_ID_7(CPUInfo);
     return CPUInfo[1] & (1 << 29); /* Check SHA */
 }
 
-#endif /* defined(__clang__) || defined(__GNUC__) */
-
 /* SHA1 implementation using new instructions
    The code is based on Jeffrey Walton's SHA1 implementation:
    https://github.com/noloader/SHA-Intrinsics
 */
 FUNC_ISA
-static void sha1_ni_(SHA_State * s, const unsigned char *q, int len)
+static inline void sha1_ni_block(__m128i *core, const uint8_t *p)
 {
-    if (s->blkused && s->blkused + len < 64) {
-        /*
-         * Trivial case: just add to the block.
- */ - memcpy(s->block + s->blkused, q, len); - s->blkused += len; - } else { - __m128i ABCD, ABCD_SAVE, E0, E0_SAVE, E1; - const __m128i MASK = _mm_set_epi64x(0x0001020304050607ULL, 0x08090a0b0c0d0e0fULL); + __m128i ABCD, E0, E1, MSG0, MSG1, MSG2, MSG3; + const __m128i MASK = _mm_set_epi64x( + 0x0001020304050607ULL, 0x08090a0b0c0d0e0fULL); - ABCD = _mm_loadu_si128((const __m128i*) s->h); - E0 = _mm_set_epi32(s->h[4], 0, 0, 0); - ABCD = _mm_shuffle_epi32(ABCD, 0x1B); + const __m128i *block = (const __m128i *)p; - /* - * We must complete and process at least one block. - */ - while (s->blkused + len >= 64) - { - __m128i MSG0, MSG1, MSG2, MSG3; - memcpy(s->block + s->blkused, q, 64 - s->blkused); - q += 64 - s->blkused; - len -= 64 - s->blkused; + /* Load initial values */ + ABCD = core[0]; + E0 = core[1]; - /* Save current state */ - ABCD_SAVE = ABCD; - E0_SAVE = E0; + /* Rounds 0-3 */ + MSG0 = _mm_loadu_si128(block); + MSG0 = _mm_shuffle_epi8(MSG0, MASK); + E0 = _mm_add_epi32(E0, MSG0); + E1 = ABCD; + ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0); - /* Rounds 0-3 */ - MSG0 = _mm_loadu_si128((const __m128i*)(s->block + 0)); - MSG0 = _mm_shuffle_epi8(MSG0, MASK); - E0 = _mm_add_epi32(E0, MSG0); - E1 = ABCD; - ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0); + /* Rounds 4-7 */ + MSG1 = _mm_loadu_si128(block + 1); + MSG1 = _mm_shuffle_epi8(MSG1, MASK); + E1 = _mm_sha1nexte_epu32(E1, MSG1); + E0 = ABCD; + ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0); + MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1); - /* Rounds 4-7 */ - MSG1 = _mm_loadu_si128((const __m128i*)(s->block + 16)); - MSG1 = _mm_shuffle_epi8(MSG1, MASK); - E1 = _mm_sha1nexte_epu32(E1, MSG1); - E0 = ABCD; - ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0); - MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1); + /* Rounds 8-11 */ + MSG2 = _mm_loadu_si128(block + 2); + MSG2 = _mm_shuffle_epi8(MSG2, MASK); + E0 = _mm_sha1nexte_epu32(E0, MSG2); + E1 = ABCD; + ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0); + MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2); + MSG0 = _mm_xor_si128(MSG0, MSG2); - /* Rounds 8-11 */ - MSG2 = _mm_loadu_si128((const __m128i*)(s->block + 32)); - MSG2 = _mm_shuffle_epi8(MSG2, MASK); - E0 = _mm_sha1nexte_epu32(E0, MSG2); - E1 = ABCD; - ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0); - MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2); - MSG0 = _mm_xor_si128(MSG0, MSG2); + /* Rounds 12-15 */ + MSG3 = _mm_loadu_si128(block + 3); + MSG3 = _mm_shuffle_epi8(MSG3, MASK); + E1 = _mm_sha1nexte_epu32(E1, MSG3); + E0 = ABCD; + MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3); + ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0); + MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3); + MSG1 = _mm_xor_si128(MSG1, MSG3); - /* Rounds 12-15 */ - MSG3 = _mm_loadu_si128((const __m128i*)(s->block + 48)); - MSG3 = _mm_shuffle_epi8(MSG3, MASK); - E1 = _mm_sha1nexte_epu32(E1, MSG3); - E0 = ABCD; - MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3); - ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0); - MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3); - MSG1 = _mm_xor_si128(MSG1, MSG3); + /* Rounds 16-19 */ + E0 = _mm_sha1nexte_epu32(E0, MSG0); + E1 = ABCD; + MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0); + ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0); + MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0); + MSG2 = _mm_xor_si128(MSG2, MSG0); - /* Rounds 16-19 */ - E0 = _mm_sha1nexte_epu32(E0, MSG0); - E1 = ABCD; - MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0); - ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0); - MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0); - MSG2 = _mm_xor_si128(MSG2, MSG0); + /* Rounds 20-23 */ + E1 = _mm_sha1nexte_epu32(E1, MSG1); + E0 = ABCD; + MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1); + ABCD = _mm_sha1rnds4_epu32(ABCD, 
 
-            /* Rounds 20-23 */
-            E1 = _mm_sha1nexte_epu32(E1, MSG1);
-            E0 = ABCD;
-            MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
-            ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
-            MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
-            MSG3 = _mm_xor_si128(MSG3, MSG1);
+    /* Rounds 24-27 */
+    E0 = _mm_sha1nexte_epu32(E0, MSG2);
+    E1 = ABCD;
+    MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
+    ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1);
+    MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
+    MSG0 = _mm_xor_si128(MSG0, MSG2);
 
-            /* Rounds 24-27 */
-            E0 = _mm_sha1nexte_epu32(E0, MSG2);
-            E1 = ABCD;
-            MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
-            ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1);
-            MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
-            MSG0 = _mm_xor_si128(MSG0, MSG2);
+    /* Rounds 28-31 */
+    E1 = _mm_sha1nexte_epu32(E1, MSG3);
+    E0 = ABCD;
+    MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
+    ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
+    MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
+    MSG1 = _mm_xor_si128(MSG1, MSG3);
 
-            /* Rounds 28-31 */
-            E1 = _mm_sha1nexte_epu32(E1, MSG3);
-            E0 = ABCD;
-            MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
-            ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
-            MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
-            MSG1 = _mm_xor_si128(MSG1, MSG3);
+    /* Rounds 32-35 */
+    E0 = _mm_sha1nexte_epu32(E0, MSG0);
+    E1 = ABCD;
+    MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
+    ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1);
+    MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
+    MSG2 = _mm_xor_si128(MSG2, MSG0);
 
-            /* Rounds 32-35 */
-            E0 = _mm_sha1nexte_epu32(E0, MSG0);
-            E1 = ABCD;
-            MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
-            ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1);
-            MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
-            MSG2 = _mm_xor_si128(MSG2, MSG0);
+    /* Rounds 36-39 */
+    E1 = _mm_sha1nexte_epu32(E1, MSG1);
+    E0 = ABCD;
+    MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
+    ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
+    MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
+    MSG3 = _mm_xor_si128(MSG3, MSG1);
 
-            /* Rounds 36-39 */
-            E1 = _mm_sha1nexte_epu32(E1, MSG1);
-            E0 = ABCD;
-            MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
-            ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
-            MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
-            MSG3 = _mm_xor_si128(MSG3, MSG1);
+    /* Rounds 40-43 */
+    E0 = _mm_sha1nexte_epu32(E0, MSG2);
+    E1 = ABCD;
+    MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
+    ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
+    MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
+    MSG0 = _mm_xor_si128(MSG0, MSG2);
 
-            /* Rounds 40-43 */
-            E0 = _mm_sha1nexte_epu32(E0, MSG2);
-            E1 = ABCD;
-            MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
-            ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
-            MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
-            MSG0 = _mm_xor_si128(MSG0, MSG2);
+    /* Rounds 44-47 */
+    E1 = _mm_sha1nexte_epu32(E1, MSG3);
+    E0 = ABCD;
+    MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
+    ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2);
+    MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
+    MSG1 = _mm_xor_si128(MSG1, MSG3);
 
-            /* Rounds 44-47 */
-            E1 = _mm_sha1nexte_epu32(E1, MSG3);
-            E0 = ABCD;
-            MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
-            ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2);
-            MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
-            MSG1 = _mm_xor_si128(MSG1, MSG3);
+    /* Rounds 48-51 */
+    E0 = _mm_sha1nexte_epu32(E0, MSG0);
+    E1 = ABCD;
+    MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
+    ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
+    MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
+    MSG2 = _mm_xor_si128(MSG2, MSG0);
 
-            /* Rounds 48-51 */
-            E0 = _mm_sha1nexte_epu32(E0, MSG0);
-            E1 = ABCD;
-            MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
-            ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
-            MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
-            MSG2 = _mm_xor_si128(MSG2, MSG0);
+    /* Rounds 52-55 */
+    E1 = _mm_sha1nexte_epu32(E1, MSG1);
+    E0 = ABCD;
+    MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
+    ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2);
+    MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
+    MSG3 = _mm_xor_si128(MSG3, MSG1);
 
-            /* Rounds 52-55 */
-            E1 = _mm_sha1nexte_epu32(E1, MSG1);
-            E0 = ABCD;
-            MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
-            ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2);
-            MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
-            MSG3 = _mm_xor_si128(MSG3, MSG1);
+    /* Rounds 56-59 */
+    E0 = _mm_sha1nexte_epu32(E0, MSG2);
+    E1 = ABCD;
+    MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
+    ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
+    MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
+    MSG0 = _mm_xor_si128(MSG0, MSG2);
 
-            /* Rounds 56-59 */
-            E0 = _mm_sha1nexte_epu32(E0, MSG2);
-            E1 = ABCD;
-            MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
-            ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
-            MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
-            MSG0 = _mm_xor_si128(MSG0, MSG2);
+    /* Rounds 60-63 */
+    E1 = _mm_sha1nexte_epu32(E1, MSG3);
+    E0 = ABCD;
+    MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
+    ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
+    MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
+    MSG1 = _mm_xor_si128(MSG1, MSG3);
 
-            /* Rounds 60-63 */
-            E1 = _mm_sha1nexte_epu32(E1, MSG3);
-            E0 = ABCD;
-            MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
-            ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
-            MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
-            MSG1 = _mm_xor_si128(MSG1, MSG3);
+    /* Rounds 64-67 */
+    E0 = _mm_sha1nexte_epu32(E0, MSG0);
+    E1 = ABCD;
+    MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
+    ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3);
+    MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
+    MSG2 = _mm_xor_si128(MSG2, MSG0);
 
-            /* Rounds 64-67 */
-            E0 = _mm_sha1nexte_epu32(E0, MSG0);
-            E1 = ABCD;
-            MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
-            ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3);
-            MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
-            MSG2 = _mm_xor_si128(MSG2, MSG0);
+    /* Rounds 68-71 */
+    E1 = _mm_sha1nexte_epu32(E1, MSG1);
+    E0 = ABCD;
+    MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
+    ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
+    MSG3 = _mm_xor_si128(MSG3, MSG1);
 
-            /* Rounds 68-71 */
-            E1 = _mm_sha1nexte_epu32(E1, MSG1);
-            E0 = ABCD;
-            MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
-            ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
-            MSG3 = _mm_xor_si128(MSG3, MSG1);
+    /* Rounds 72-75 */
+    E0 = _mm_sha1nexte_epu32(E0, MSG2);
+    E1 = ABCD;
+    MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
+    ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3);
 
-            /* Rounds 72-75 */
-            E0 = _mm_sha1nexte_epu32(E0, MSG2);
-            E1 = ABCD;
-            MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
-            ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3);
+    /* Rounds 76-79 */
+    E1 = _mm_sha1nexte_epu32(E1, MSG3);
+    E0 = ABCD;
+    ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
 
-            /* Rounds 76-79 */
-            E1 = _mm_sha1nexte_epu32(E1, MSG3);
-            E0 = ABCD;
-            ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
-
-            /* Combine state */
-            E0 = _mm_sha1nexte_epu32(E0, E0_SAVE);
-            ABCD = _mm_add_epi32(ABCD, ABCD_SAVE);
-
-            s->blkused = 0;
-        }
-
-        ABCD = _mm_shuffle_epi32(ABCD, 0x1B);
-
-        /* Save state */
-        _mm_storeu_si128((__m128i*) s->h, ABCD);
-        s->h[4] = _mm_extract_epi32(E0, 3);
-
-        memcpy(s->block, q, len);
-        s->blkused = len;
-    }
+    /* Combine state */
+    core[0] = _mm_add_epi32(ABCD, core[0]);
+    core[1] = _mm_sha1nexte_epu32(E0, core[1]);
 }
 
-/*
- * Workaround LLVM bug https://bugs.llvm.org/show_bug.cgi?id=34980
+typedef struct sha1_ni {
+    /*
+     * core[0] stores the first four words of the SHA-1 state. core[1]
+     * stores just the fifth word, in the vector lane at the highest
+     * address.
+     */
+    __m128i core[2];
+    sha1_block blk;
+    void *pointer_to_free;
+    BinarySink_IMPLEMENTATION;
+    ssh_hash hash;
+} sha1_ni;
+
+static void sha1_ni_write(BinarySink *bs, const void *vp, size_t len);
+
+static sha1_ni *sha1_ni_alloc(void)
+{
+    /*
+     * The __m128i variables in the context structure need to be
+     * 16-byte aligned, but not all malloc implementations that this
+     * code has to work with will guarantee to return a 16-byte
+     * aligned pointer. So we over-allocate, manually realign the
+     * pointer ourselves, and store the original one inside the
+     * context so we know how to free it later.
+     */
+    void *allocation = smalloc(sizeof(sha1_ni) + 15);
+    uintptr_t alloc_address = (uintptr_t)allocation;
+    uintptr_t aligned_address = (alloc_address + 15) & ~15;
+    sha1_ni *s = (sha1_ni *)aligned_address;
+    s->pointer_to_free = allocation;
+    return s;
+}
+
+FUNC_ISA static ssh_hash *sha1_ni_new(const ssh_hashalg *alg)
+{
+    if (!sha1_hw_available_cached())
+        return NULL;
+
+    sha1_ni *s = sha1_ni_alloc();
+
+    /* Initialise the core vectors in their storage order */
+    s->core[0] = _mm_set_epi64x(
+        0x67452301efcdab89ULL, 0x98badcfe10325476ULL);
+    s->core[1] = _mm_set_epi32(0xc3d2e1f0, 0, 0, 0);
+
+    sha1_block_setup(&s->blk);
+
+    s->hash.vt = alg;
+    BinarySink_INIT(s, sha1_ni_write);
+    BinarySink_DELEGATE_INIT(&s->hash, s);
+    return &s->hash;
+}
+
+static ssh_hash *sha1_ni_copy(ssh_hash *hash)
+{
+    sha1_ni *s = container_of(hash, sha1_ni, hash);
+    sha1_ni *copy = sha1_ni_alloc();
+
+    void *ptf_save = copy->pointer_to_free;
+    *copy = *s; /* structure copy */
+    copy->pointer_to_free = ptf_save;
+
+    BinarySink_COPIED(copy);
+    BinarySink_DELEGATE_INIT(&copy->hash, copy);
+
+    return &copy->hash;
+}
+
+static void sha1_ni_free(ssh_hash *hash)
+{
+    sha1_ni *s = container_of(hash, sha1_ni, hash);
+
+    void *ptf = s->pointer_to_free;
+    smemclr(s, sizeof(*s));
+    sfree(ptf);
+}
+
+static void sha1_ni_write(BinarySink *bs, const void *vp, size_t len)
+{
+    sha1_ni *s = BinarySink_DOWNCAST(bs, sha1_ni);
+
+    while (len > 0)
+        if (sha1_block_write(&s->blk, &vp, &len))
+            sha1_ni_block(s->core, s->blk.block);
+}
+
+FUNC_ISA static void sha1_ni_final(ssh_hash *hash, uint8_t *digest)
+{
+    sha1_ni *s = container_of(hash, sha1_ni, hash);
+
+    sha1_block_pad(&s->blk, BinarySink_UPCAST(s));
+
+    /* Rearrange the first vector into its output order */
+    __m128i abcd = _mm_shuffle_epi32(s->core[0], 0x1B);
+
+    /* Byte-swap it into the output endianness */
+    const __m128i mask = _mm_setr_epi8(3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12);
+    abcd = _mm_shuffle_epi8(abcd, mask);
+
+    /* And store it */
+    _mm_storeu_si128((__m128i *)digest, abcd);
+
+    /* Finally, store the leftover word */
+    uint32_t e = _mm_extract_epi32(s->core[1], 3);
+    PUT_32BIT_MSB_FIRST(digest + 16, e);
+
+    sha1_ni_free(hash);
+}
+
+const ssh_hashalg ssh_sha1_hw = {
+    sha1_ni_new, sha1_ni_copy, sha1_ni_final, sha1_ni_free,
+    20, 64, "SHA-1",
+};
+
+/* ----------------------------------------------------------------------
+ * Stub functions if we have no hardware-accelerated SHA-1. In this
+ * case, sha1_hw_new returns NULL (though it should also never be
+ * selected by sha1_select, so the only thing that should even be
+ * _able_ to call it is testcrypt). As a result, the remaining vtable
+ * functions should never be called at all.
  */
-static void sha1_ni(SHA_State * s, const unsigned char *q, int len)
-{
-    sha1_ni_(s, q, len);
-}
-#else /* COMPILER_SUPPORTS_AES_NI */
+#elif HW_SHA1 == HW_SHA1_NONE
 
-static void sha1_ni(SHA_State * s, const unsigned char *q, int len)
-{
-    unreachable("sha1_ni not compiled in");
-}
-
-bool supports_sha_ni(void)
+static bool sha1_hw_available(void)
 {
     return false;
 }
 
-#endif /* COMPILER_SUPPORTS_AES_NI */
+static ssh_hash *sha1_stub_new(const ssh_hashalg *alg)
+{
+    return NULL;
+}
+
+#define STUB_BODY { unreachable("Should never be called"); }
+
+static ssh_hash *sha1_stub_copy(ssh_hash *hash) STUB_BODY
+static void sha1_stub_free(ssh_hash *hash) STUB_BODY
+static void sha1_stub_final(ssh_hash *hash, uint8_t *digest) STUB_BODY
+
+const ssh_hashalg ssh_sha1_hw = {
+    sha1_stub_new, sha1_stub_copy, sha1_stub_final, sha1_stub_free,
+    20, 64, "SHA-1",
+};
+
+#endif /* HW_SHA1 */
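The over-allocate-and-realign idiom in sha1_ni_alloc() above is independent of SHA-NI and worth seeing in isolation: allocate (alignment - 1) extra bytes, round the address up to the next 16-byte boundary, and stash the original pointer so it can still be freed. A minimal standalone sketch of the same technique (illustrative only, not part of the patch: plain malloc/free stand in for smalloc/sfree, and the demo_* names are invented):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct demo {
    void *pointer_to_free;     /* original, possibly unaligned pointer */
    unsigned char payload[64]; /* stand-in for the __m128i fields */
} demo;

static demo *demo_alloc(void)
{
    /* Over-allocate by 15 bytes so that a 16-byte boundary must fall
     * within the allocation, then round the address up to it. */
    void *allocation = malloc(sizeof(demo) + 15);
    uintptr_t aligned = ((uintptr_t)allocation + 15) & ~(uintptr_t)15;
    demo *d = (demo *)aligned;
    d->pointer_to_free = allocation;   /* remember what to free */
    return d;
}

static void demo_free(demo *d)
{
    free(d->pointer_to_free);   /* free the original pointer, not d */
}

int main(void)
{
    demo *d = demo_alloc();
    printf("16-byte aligned: %s\n", ((uintptr_t)d & 15) ? "no" : "yes");
    demo_free(d);
    return 0;
}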
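The layout described in the comment on struct sha1_ni can be checked directly: core[0] packs a,b,c,d with 'a' in the top 32-bit lane (the lane at the highest address once the vector is stored on a little-endian machine), and core[1] holds 'e' alone in its top lane. A small illustrative program using the same _mm_set_* initialisation as sha1_ni_new() (again not part of the patch; it needs SSE4.1 for _mm_extract_epi32, e.g. gcc -msse4.1):

#include <stdio.h>
#include <smmintrin.h>   /* SSE4.1, for _mm_extract_epi32 */

int main(void)
{
    /* The five SHA-1 initial values, packed as the patch stores them */
    __m128i core0 = _mm_set_epi64x(
        0x67452301efcdab89ULL, 0x98badcfe10325476ULL);
    __m128i core1 = _mm_set_epi32(0xc3d2e1f0, 0, 0, 0);

    printf("a = %08x (expect 67452301)\n",
           (unsigned)_mm_extract_epi32(core0, 3));
    printf("d = %08x (expect 10325476)\n",
           (unsigned)_mm_extract_epi32(core0, 0));
    printf("e = %08x (expect c3d2e1f0)\n",
           (unsigned)_mm_extract_epi32(core1, 3));
    return 0;
}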
diff --git a/test/cryptsuite.py b/test/cryptsuite.py
index 6216f4a7..9ff987a7 100755
--- a/test/cryptsuite.py
+++ b/test/cryptsuite.py
@@ -1172,6 +1172,46 @@ class crypt(MyTestBase):
         self.assertEqualBin(data2, expected_data2[:127])
         self.assertEqualBin(data3, expected_data3)
 
+    def testHashPadding(self):
+        # A consistency test for hashes that use MD5/SHA-1/SHA-2 style
+        # padding of the message into a whole number of fixed-size
+        # blocks. We test-hash a message of every length up to twice
+        # the block length, to make sure there's no off-by-1 error in
+        # the code that decides how much padding to put on.
+
+        # Source: generated using Python hashlib as an independent
+        # implementation. The function below will do it, called with
+        # parameters such as (hashlib.sha256,128).
+        #
+        # def gen_testcase(hashclass, maxlen):
+        #     return hashclass(b''.join(hashclass(text[:i]).digest()
+        #         for i in range(maxlen))).hexdigest()
+
+        text = """
+Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do
+eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad
+minim veniam, quis nostrud exercitation ullamco laboris nisi ut
+aliquip ex ea commodo consequat. Duis aute irure dolor in
+reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla
+pariatur. Excepteur sint occaecat cupidatat non proident, sunt in
+culpa qui officia deserunt mollit anim id est laborum.
+        """.replace('\n', ' ').strip()
+
+        def test(hashname, maxlen, expected):
+            assert len(text) >= maxlen
+            buf = b''.join(hash_str(hashname, text[:i])
+                           for i in range(maxlen))
+            self.assertEqualBin(hash_str(hashname, buf), unhex(expected))
+
+        test('md5', 128, '8169d766cc3b8df182b3ce756ae19a15')
+        test('sha1', 128, '3691759577deb3b70f427763a9c15acb9dfc0259')
+        test('sha256', 128, 'ec539c4d678412c86c13ee4eb9452232'
+                            '35d4eed3368d876fdf10c9df27396640')
+        test('sha512', 256,
+             'cb725b4b4ec0ac1174d69427b4d97848b7db4fc01181f99a8049a4d721862578'
+             'f91e026778bb2d389a9dd88153405189e6ba438b213c5387284103d2267fd055'
+             )
+
 class standard_test_vectors(MyTestBase):
     def testAES(self):
         def vector(cipher, key, plaintext, ciphertext):
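One detail the commented-out gen_testcase() sketch in testHashPadding glosses over is that hashlib hashes bytes rather than str. A self-contained re-derivation of the expected values, as an aid for reviewers (not part of the patch; it assumes the same lorem-ipsum text as the test, encoded as ASCII):

import hashlib

text = (
    "Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do "
    "eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad "
    "minim veniam, quis nostrud exercitation ullamco laboris nisi ut "
    "aliquip ex ea commodo consequat. Duis aute irure dolor in "
    "reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla "
    "pariatur. Excepteur sint occaecat cupidatat non proident, sunt in "
    "culpa qui officia deserunt mollit anim id est laborum."
).encode('ASCII')

def gen_testcase(hashclass, maxlen):
    # Hash every prefix of the text up to maxlen bytes, concatenate
    # the digests, and hash the concatenation once more.
    return hashclass(b''.join(hashclass(text[:i]).digest()
                              for i in range(maxlen))).hexdigest()

# Should reproduce the sha256 vector in the test above
print(gen_testcase(hashlib.sha256, 128))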