From f9eed9654683ea7aebb1856eae5ec9b14a488106 Mon Sep 17 00:00:00 2001
From: "nelsonb%netscape.com" <nelsonb%netscape.com>
Date: Tue, 22 Aug 2000 01:57:34 +0000
Subject: [PATCH] Big changes for performance. - In mpi-priv.h, declare new 3
 argument versions of s_mp_add and s_mp_sub. Also declare new set of s_mpv_
 functions that operate on vectors (arrays) of mp_digits instead of on
 mp_ints.  These functions are candidates for implementation in assembler. -
 In mpi.c reimplement mp_add and mp_sub using the new 3arg functions.
 Implement 3 argument versions of s_mp_add and s_mp_sub. This eliminates all
 need for temporary variables in mp_add and mp_sub. Implement c language
 reference implementations of new s_mpv vector multiply and multiply and add
 functions.  Change mp_mul and mp_sqr so they no longer pre-zero the output
 variable.  It's no longer nececssary with the new s_mpv functions.  s_mp_pad
 no longer zeros out the new padded space. -In mpmontg.c, implement variable
 width exponetiation windows.  Implement a new function to compute the
 multiply and Montgomery reduction in a single pass.  This is "Improvement 2"
 from Dusse' and Kaliski's paper "A Cryptographic Library for the Motorola
 DSP56000".  Performance impact is negligible in this c implementation. 
 However, this function is another target for assembly language optimization.

---
 security/nss/lib/freebl/mpi/mpi-priv.h |  26 +-
 security/nss/lib/freebl/mpi/mpi.c      | 346 ++++++++++++++++---------
 security/nss/lib/freebl/mpi/mpi.h      |   7 +-
 security/nss/lib/freebl/mpi/mpmontg.c  | 312 ++++++++++++++--------
 4 files changed, 456 insertions(+), 235 deletions(-)

diff --git a/security/nss/lib/freebl/mpi/mpi-priv.h b/security/nss/lib/freebl/mpi/mpi-priv.h
index f12720bd1e2..d138d9280b7 100644
--- a/security/nss/lib/freebl/mpi/mpi-priv.h
+++ b/security/nss/lib/freebl/mpi/mpi-priv.h
@@ -1,8 +1,11 @@
 /*
- *  mpi-priv.h
- *
+ *  mpi-priv.h	- Private header file for MPI 
  *  Arbitrary precision integer arithmetic library
  *
+ *  NOTE WELL: the content of this header file is NOT part of the "public"
+ *  API for the MPI library, and may change at any time.  
+ *  Application programs that use libmpi should NOT include this header file.
+ *
  * The contents of this file are subject to the Mozilla Public
  * License Version 1.1 (the "License"); you may not use this file
  * except in compliance with the License. You may obtain a copy of
@@ -35,7 +38,7 @@
  * the GPL.  If you do not delete the provisions above, a recipient
  * may use your version of this file under either the MPL or the GPL.
  *
- *  $Id: mpi-priv.h,v 1.8 2000-08-09 20:42:18 nelsonb%netscape.com Exp $
+ *  $Id: mpi-priv.h,v 1.9 2000-08-22 01:57:33 nelsonb%netscape.com Exp $
  */
 #ifndef _MPI_PRIV_H_
 #define _MPI_PRIV_H_ 1
@@ -142,6 +145,7 @@ extern const float s_logv_2[];
  void     s_mp_free(void *ptr);                   /* general free function */
 extern unsigned long mp_allocs;
 extern unsigned long mp_frees;
+extern unsigned long mp_copies;
 #else
 
  /* Even if these are defined as macros, we need to respect the settings
@@ -198,9 +202,11 @@ mp_err   s_mp_mod_d(mp_int *mp, mp_digit d, mp_digit *r);
 mp_err   s_mp_reduce(mp_int *x, const mp_int *m, const mp_int *mu);
                                                /* Barrett reduction       */
 mp_err   s_mp_add(mp_int *a, const mp_int *b); /* magnitude addition      */
+mp_err   s_mp_add_3arg(const mp_int *a, const mp_int *b, mp_int *c);
+mp_err   s_mp_sub(mp_int *a, const mp_int *b); /* magnitude subtract      */
+mp_err   s_mp_sub_3arg(const mp_int *a, const mp_int *b, mp_int *c);
 mp_err   s_mp_add_offset(mp_int *a, mp_int *b, mp_size offset);
                                                /* a += b * RADIX^offset   */
-mp_err   s_mp_sub(mp_int *a, const mp_int *b); /* magnitude subtract      */
 mp_err   s_mp_mul(mp_int *a, const mp_int *b); /* magnitude multiply      */
 #if MP_SQUARE
 mp_err   s_mp_sqr(mp_int *a);                  /* magnitude square        */
@@ -219,11 +225,17 @@ int      s_mp_tovalue(char ch, int r);          /* convert ch to value    */
 char     s_mp_todigit(mp_digit val, int r, int low); /* convert val to digit */
 int      s_mp_outlen(int bits, int r);          /* output length in bytes */
 mp_digit s_mp_invmod_32b(mp_digit P);   /* returns (P ** -1) mod (2 ** 32) */
-void    s_mp_mul_add(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c);
-                                       /* c += a * b * (MP_RADIX ** offset);  */
 
+/* ------ mpv functions, operate on arrays of digits, not on mp_int's ------ */
+void     s_mpv_mul_d(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c);
+void     s_mpv_mul_d_add(const mp_digit *a, mp_size a_len, mp_digit b, 
+			 mp_digit *c);
+void     s_mpv_mul_d_add_prop(const mp_digit *a, mp_size a_len, mp_digit b, 
+			      mp_digit *c);
+
+/* c += a * b * (MP_RADIX ** offset);  */
 #define s_mp_mul_d_add_offset(a, b, c, off) \
-  (s_mp_mul_add(MP_DIGITS(a), MP_USED(a), b, MP_DIGITS(c) + off), MP_OKAY)
+(s_mpv_mul_d_add_prop(MP_DIGITS(a), MP_USED(a), b, MP_DIGITS(c) + off), MP_OKAY)
 
 /* }}} */
 #endif
diff --git a/security/nss/lib/freebl/mpi/mpi.c b/security/nss/lib/freebl/mpi/mpi.c
index ff687c58a80..2f8d9928674 100644
--- a/security/nss/lib/freebl/mpi/mpi.c
+++ b/security/nss/lib/freebl/mpi/mpi.c
@@ -35,7 +35,7 @@
  * the GPL.  If you do not delete the provisions above, a recipient
  * may use your version of this file under either the MPL or the GPL.
  *
- *  $Id: mpi.c,v 1.18 2000-08-11 01:58:20 nelsonb%netscape.com Exp $
+ *  $Id: mpi.c,v 1.19 2000-08-22 01:57:34 nelsonb%netscape.com Exp $
  */
 
 #include "mpi-priv.h"
@@ -81,8 +81,8 @@ static const char *s_dmap_1 =
 
 unsigned long mp_allocs;
 unsigned long mp_frees;
+unsigned long mp_copies;
 
-#define MP_CHECKOK(x) if (MP_OKAY > (res = (x))) goto CLEANUP
 
 /* {{{ Default precision manipulation */
 
@@ -200,6 +200,7 @@ mp_err mp_copy(const mp_int *from, mp_int *to)
   if(from == to)
     return MP_OKAY;
 
+  ++mp_copies;
   { /* copy */
     mp_digit   *tmp;
 
@@ -702,38 +703,22 @@ mp_err mp_neg(const mp_int *a, mp_int *b)
 
 mp_err mp_add(const mp_int *a, const mp_int *b, mp_int *c)
 {
-  mp_int  tmp;
   mp_err  res;
 
   ARGCHK(a != NULL && b != NULL && c != NULL, MP_BADARG);
 
-  MP_DIGITS(&tmp) = 0;
-  if (c == a) {
-    MP_CHECKOK( mp_init_copy(&tmp, a) );
-    a = &tmp;
-    if (c == b)
-      b = &tmp;
-  } else if (c == b) {
-    MP_CHECKOK( mp_init_copy(&tmp, b) );
-    b = &tmp;
-  }
-
   if(SIGN(a) == SIGN(b)) { /* same sign:  add values, keep sign */
-    MP_CHECKOK( mp_copy(a, c)  );
-    MP_CHECKOK( s_mp_add(c, b) );
+    MP_CHECKOK( s_mp_add_3arg(a, b, c) );
   } else if(s_mp_cmp(a, b) >= 0) {  /* different sign: a >= b   */
-    MP_CHECKOK( mp_copy(a, c)  );
-    MP_CHECKOK( s_mp_sub(c, b) );
+    MP_CHECKOK( s_mp_sub_3arg(a, b, c) );
   } else {                          /* different sign: a < b    */
-    MP_CHECKOK( mp_copy(b, c)  );
-    MP_CHECKOK( s_mp_sub(c, a) );
+    MP_CHECKOK( s_mp_sub_3arg(b, a, c) );
   }
 
   if (s_mp_cmp_d(c, 0) == MP_EQ)
     SIGN(c) = ZPOS;
 
 CLEANUP:
-  mp_clear(&tmp);
   return res;
 
 } /* end mp_add() */
@@ -750,37 +735,25 @@ CLEANUP:
 
 mp_err mp_sub(const mp_int *a, const mp_int *b, mp_int *c)
 {
-  mp_int  tmp;
   mp_err  res;
-  int     diffSign;
+  int     magDiff;
 
   ARGCHK(a != NULL && b != NULL && c != NULL, MP_BADARG);
 
-  MP_DIGITS(&tmp) = 0;
   if (a == b) {
     mp_zero(c);
     return MP_OKAY;
   }
-  if (c == a) {
-    MP_CHECKOK( mp_init_copy(&tmp, a) );
-    a = &tmp;
-  } else if (c == b) {
-    MP_CHECKOK( mp_init_copy(&tmp, b) );
-    b = &tmp;
-  }
 
   if (MP_SIGN(a) != MP_SIGN(b)) {
-    MP_CHECKOK( mp_copy(a, c)  );
-    MP_CHECKOK( s_mp_add(c, b) );
-  } else if (!(diffSign = s_mp_cmp(a, b))) {
+    MP_CHECKOK( s_mp_add_3arg(a, b, c) );
+  } else if (!(magDiff = s_mp_cmp(a, b))) {
     mp_zero(c);
     res = MP_OKAY;
-  } else if (diffSign > 0) {
-    MP_CHECKOK( mp_copy(a, c)  );
-    MP_CHECKOK( s_mp_sub(c, b) );
+  } else if (magDiff > 0) {
+    MP_CHECKOK( s_mp_sub_3arg(a, b, c) );
   } else {
-    MP_CHECKOK( mp_copy(b, c)  );
-    MP_CHECKOK( s_mp_sub(c, a) );
+    MP_CHECKOK( s_mp_sub_3arg(b, a, c) );
     MP_SIGN(c) = !MP_SIGN(a);
   }
 
@@ -788,7 +761,6 @@ mp_err mp_sub(const mp_int *a, const mp_int *b, mp_int *c)
     MP_SIGN(c) = MP_ZPOS;
 
 CLEANUP:
-  mp_clear(&tmp);
   return res;
 
 } /* end mp_sub() */
@@ -808,6 +780,7 @@ mp_err   mp_mul(const mp_int *a, const mp_int *b, mp_int * c)
   mp_int   tmp;
   mp_err   res;
   mp_size  ib;
+  mp_size  useda, usedb;
 
   ARGCHK(a != NULL && b != NULL && c != NULL, MP_BADARG);
 
@@ -831,21 +804,24 @@ mp_err   mp_mul(const mp_int *a, const mp_int *b, mp_int * c)
     a = xch;
   }
 
-  /* This has the effect of left-padding with zeroes... */
   MP_USED(c) = 1; MP_DIGIT(c, 0) = 0;
   if((res = s_mp_pad(c, USED(a) + USED(b))) != MP_OKAY)
     goto CLEANUP;
 
   pb = MP_DIGITS(b);
+  s_mpv_mul_d(MP_DIGITS(a), MP_USED(a), *pb++, MP_DIGITS(c));
+
   /* Outer loop:  Digits of b */
-  for(ib = 0; ib < USED(b); ib++) {
+  useda = MP_USED(a);
+  usedb = MP_USED(b);
+  for (ib = 1; ib < usedb; ib++) {
     mp_digit b_i    = *pb++;
 
-    if(b_i == 0)
-      continue;
-
     /* Inner product:  Digits of a */
-    s_mp_mul_d_add_offset(a, b_i, c, ib);
+    if (b_i)
+      s_mpv_mul_d_add(MP_DIGITS(a), useda, b_i, MP_DIGITS(c) + ib);
+    else
+      MP_DIGIT(c, ib + useda) = b_i;
   }
 
   s_mp_clamp(c);
@@ -876,12 +852,13 @@ CLEANUP:
 /* sqr = a^2;   Caller provides both a and tmp; */
 mp_err   mp_sqr(const mp_int *a, mp_int *sqr)
 {
-  mp_digit *pa, *ps, *alim;
+  mp_digit *pa, *ps;
   mp_word  w; 
-  mp_digit d, k;
+  mp_digit d;
   mp_err   res;
   mp_size  ix;
   mp_int   tmp;
+  int      count;
 
   ARGCHK(a != NULL && sqr != NULL, MP_BADARG);
 
@@ -893,56 +870,32 @@ mp_err   mp_sqr(const mp_int *a, mp_int *sqr)
     DIGITS(&tmp) = 0;
   }
 
-  /* Left-pad with zeroes */
-  MP_USED(sqr) = 1; MP_DIGIT(sqr, 0) = 0;
-  if((res = s_mp_pad(sqr, 2 * USED(a))) != MP_OKAY)
-    goto CLEANUP;
-
-  /*
-    The inner product is computed as:
-       (C, S) = t[i,j] + 2 a[i] a[j] + C
-   */
+  ix = 2 * MP_USED(a);
+  if (ix > MP_ALLOC(sqr)) {
+    MP_USED(sqr) = 1; 
+    MP_CHECKOK( s_mp_grow(sqr, ix) );
+  } 
+  MP_USED(sqr) = ix;
+  MP_DIGIT(sqr, 0) = 0;
 
   pa = MP_DIGITS(a);
-  alim = pa + MP_USED(a);
-  for (ix = 0; pa < alim; ++ix) {
+  count = MP_USED(a) - 1;
+  if (count > 0) {
     d = *pa++;
-    ps = MP_DIGITS(sqr) + 1 + (ix << 1);
-    s_mp_mul_add(pa, alim - pa, d, ps);
-  } /* for(ix ...) */
+    s_mpv_mul_d(pa, count, d, MP_DIGITS(sqr) + 1);
+    for (ix = 3; --count > 0; ix += 2) {
+      d = *pa++;
+      s_mpv_mul_d_add(pa, count, d, MP_DIGITS(sqr) + ix);
+    } /* for(ix ...) */
+    MP_DIGIT(sqr, MP_USED(sqr)-1) = 0; /* above loop stopped short of this. */
 
-  /* now sqr *= 2 */
-  ps = MP_DIGITS(sqr);
-  k = 0;
-#define SHIFT_1_BIT(n) \
-    d = ps[n]; ps[n] = (d << 1) | k; k = d >> (DIGIT_BIT - 1)
-  for (ix = MP_USED(sqr); ix >= 8; ix -= 8) {
-    SHIFT_1_BIT(0);
-    SHIFT_1_BIT(1);
-    SHIFT_1_BIT(2);
-    SHIFT_1_BIT(3);
-    SHIFT_1_BIT(4);
-    SHIFT_1_BIT(5);
-    SHIFT_1_BIT(6);
-    SHIFT_1_BIT(7);
-    ps += 8;
-  }
-  if (ix) {
-    ps += ix;
-    switch (ix) {  /* all fallthru */
-    case 7: SHIFT_1_BIT(-7); /* FALLTHRU */
-    case 6: SHIFT_1_BIT(-6); /* FALLTHRU */
-    case 5: SHIFT_1_BIT(-5); /* FALLTHRU */
-    case 4: SHIFT_1_BIT(-4); /* FALLTHRU */
-    case 3: SHIFT_1_BIT(-3); /* FALLTHRU */
-    case 2: SHIFT_1_BIT(-2); /* FALLTHRU */
-    case 1: SHIFT_1_BIT(-1); /* FALLTHRU */
-    case 0: break;
-    }
+    /* now sqr *= 2 */
+    s_mp_mul_2(sqr);
+  } else {
+    MP_DIGIT(sqr, 1) = 0;
   }
 
   pa = MP_DIGITS(a);
-  alim = pa + MP_USED(a);
   ps = MP_DIGITS(sqr);
   w  = 0;
 #define ADD_SQUARE(n) \
@@ -2544,7 +2497,7 @@ mp_err   s_mp_pad(mp_int *mp, mp_size min)
       if ((res = s_mp_grow(mp, min)) != MP_OKAY)
 	return res;
     } else {
-      s_mp_setz(DIGITS(mp) + USED(mp), min - USED(mp));
+/*    s_mp_setz(DIGITS(mp) + USED(mp), min - USED(mp)); */
     }
 
     /* Increase precision; should already be 0-filled */
@@ -2770,12 +2723,15 @@ void     s_mp_rshd(mp_int *mp, mp_size p)
   for (ix = USED(mp) - p; ix > 0; ix--)
     *dst++ = *src++;
 
+  MP_USED(mp) -= p;
   /* Fill the top digits with zeroes */
   while (p-- > 0)
     *dst++ = 0;
 
+#if 0
   /* Strip off any leading zeroes    */
   s_mp_clamp(mp);
+#endif
 
 } /* end s_mp_rshd() */
 
@@ -2796,21 +2752,23 @@ void     s_mp_div_2(mp_int *mp)
 
 mp_err s_mp_mul_2(mp_int *mp)
 {
-  int      ix;
-  mp_digit kin = 0, kout;
-  mp_err   res;
+  mp_digit *pd;
+  int      ix, used;
+  mp_digit kin = 0;
 
   /* Shift digits leftward by 1 bit */
-  for(ix = 0; ix < USED(mp); ix++) {
-    kout = (DIGIT(mp, ix) >> (DIGIT_BIT - 1)) & 1;
-    DIGIT(mp, ix) = (DIGIT(mp, ix) << 1) | kin;
-
-    kin = kout;
+  used = MP_USED(mp);
+  pd = MP_DIGITS(mp);
+  for (ix = 0; ix < used; ix++) {
+    mp_digit d = *pd;
+    *pd++ = (d << 1) | kin;
+    kin = (d >> (DIGIT_BIT - 1));
   }
 
   /* Deal with rollover from last digit */
-  if(kin) {
-    if(ix >= ALLOC(mp)) {
+  if (kin) {
+    if (ix >= ALLOC(mp)) {
+      mp_err res;
       if((res = s_mp_grow(mp, ALLOC(mp) + 1)) != MP_OKAY)
 	return res;
     }
@@ -3139,8 +3097,10 @@ mp_err   s_mp_mod_d(mp_int *mp, mp_digit d, mp_digit *r)
 /* Compute a = |a| + |b|                                                  */
 mp_err   s_mp_add(mp_int *a, const mp_int *b)  /* magnitude addition      */
 {
-  mp_word   w, k = 0;
+  mp_digit *pa, *pb;
+  mp_word   w = 0;
   mp_size   ix;
+  mp_size   used;
   mp_err    res;
 
   /* Make sure a has enough precision for the output value */
@@ -3154,19 +3114,23 @@ mp_err   s_mp_add(mp_int *a, const mp_int *b)  /* magnitude addition      */
     less precision, we'll have to make sure the carry out is duly
     propagated upward among the higher-order digits of the sum.
    */
-  for(ix = 0; ix < USED(b); ix++) {
-    w = (mp_word)DIGIT(a, ix) + DIGIT(b, ix) + k;
-    DIGIT(a, ix) = ACCUM(w);
-    k = CARRYOUT(w);
+  pa = MP_DIGITS(a);
+  pb = MP_DIGITS(b);
+  used = MP_USED(b);
+  for(ix = 0; ix < used; ix++) {
+    w = w + *pa + *pb++;
+    *pa++ = ACCUM(w);
+    w = CARRYOUT(w);
   }
 
   /* If we run out of 'b' digits before we're actually done, make
      sure the carries get propagated upward...  
    */
-  while(k && ix < USED(a)) {
-    w = (mp_word)DIGIT(a, ix) + k;
-    DIGIT(a, ix) = ACCUM(w);
-    k = CARRYOUT(w);
+  used = MP_USED(a);
+  while (w && ix < used) {
+    w = w + *pa;
+    *pa++ = ACCUM(w);
+    w = CARRYOUT(w);
     ++ix;
   }
 
@@ -3174,11 +3138,11 @@ mp_err   s_mp_add(mp_int *a, const mp_int *b)  /* magnitude addition      */
      it.  We could have done this initially, but why touch the memory
      allocator unless we're sure we have to?
    */
-  if(k) {
+  if (w) {
     if((res = s_mp_pad(a, USED(a) + 1)) != MP_OKAY)
       return res;
 
-    DIGIT(a, ix) = (mp_digit)k;
+    DIGIT(a, ix) = (mp_digit)w;
   }
 
   return MP_OKAY;
@@ -3187,6 +3151,69 @@ mp_err   s_mp_add(mp_int *a, const mp_int *b)  /* magnitude addition      */
 
 /* }}} */
 
+/* Compute c = |a| + |b|         */ /* magnitude addition      */
+mp_err   s_mp_add_3arg(const mp_int *a, const mp_int *b, mp_int *c)  
+{
+  mp_digit *pa, *pb, *pc;
+  mp_word   w = 0;
+  mp_size   ix;
+  mp_size   used;
+  mp_err    res;
+
+  MP_SIGN(c) = MP_SIGN(a);
+  if (MP_USED(a) < MP_USED(b)) {
+    const mp_int *xch = a;
+    a = b;
+    b = xch;
+  }
+
+  /* Make sure a has enough precision for the output value */
+  if (MP_OKAY != (res = s_mp_pad(c, MP_USED(a))))
+    return res;
+
+  /*
+    Add up all digits up to the precision of b.  If b had initially
+    the same precision as a, or greater, we took care of it by the
+    exchange step above, so there is no problem.  If b had initially
+    less precision, we'll have to make sure the carry out is duly
+    propagated upward among the higher-order digits of the sum.
+   */
+  pa = MP_DIGITS(a);
+  pb = MP_DIGITS(b);
+  pc = MP_DIGITS(c);
+  used = MP_USED(b);
+  for (ix = 0; ix < used; ix++) {
+    w = w + *pa++ + *pb++;
+    *pc++ = ACCUM(w);
+    w = CARRYOUT(w);
+  }
+
+  /* If we run out of 'b' digits before we're actually done, make
+     sure the carries get propagated upward...  
+   */
+  used = MP_USED(a);
+  while (ix < used) {
+    w = w + *pa++;
+    *pc++ = ACCUM(w);
+    w = CARRYOUT(w);
+    ++ix;
+  }
+
+  /* If there's an overall carry out, increase precision and include
+     it.  We could have done this initially, but why touch the memory
+     allocator unless we're sure we have to?
+   */
+  if (w) {
+    if((res = s_mp_pad(c, ix + 1)) != MP_OKAY)
+      return res;
+
+    DIGIT(c, ix) = (mp_digit)w;
+    ++ix;
+  }
+  MP_USED(c) = ix;
+  return MP_OKAY;
+
+}
 /* {{{ s_mp_add_offset(a, b, offset) */
 
 /* Compute a = |a| + ( |b| * (RADIX ** offset) )             */
@@ -3288,6 +3315,52 @@ mp_err   s_mp_sub(mp_int *a, const mp_int *b)  /* magnitude subtract      */
 
 /* }}} */
 
+/* Compute c = |a| - |b|, assumes |a| >= |b| */ /* magnitude subtract      */
+mp_err   s_mp_sub_3arg(const mp_int *a, const mp_int *b, mp_int *c)  
+{
+  mp_digit *pa, *pb, *pc;
+  mp_sword  w = 0;
+  int       ix, limit;
+  mp_err    res;
+
+  MP_SIGN(c) = MP_SIGN(a);
+
+  /* Make sure a has enough precision for the output value */
+  if (MP_OKAY != (res = s_mp_pad(c, MP_USED(a))))
+    return res;
+
+  /*
+    Subtract and propagate borrow.  Up to the precision of b, this
+    accounts for the digits of b; after that, we just make sure the
+    carries get to the right place.  This saves having to pad b out to
+    the precision of a just to make the loops work right...
+   */
+  pa = MP_DIGITS(a);
+  pb = MP_DIGITS(b);
+  pc = MP_DIGITS(c);
+  limit = MP_USED(b);
+  for (ix = 0; ix < limit; ++ix) {
+    w = w + *pa++ - *pb++;
+    *pc++ = ACCUM(w);
+    w >>= MP_DIGIT_BIT;
+  }
+  for (limit = MP_USED(a); ix < limit; ++ix) {
+    w = w + *pa++;
+    *pc++ = ACCUM(w);
+    w >>= MP_DIGIT_BIT;
+  }
+
+  /* Clobber any leading zeroes we created    */
+  MP_USED(c) = ix;
+  s_mp_clamp(c);
+
+  /* 
+     If there was a borrow out, then |b| > |a| in violation
+     of our input invariant.  We've already done the work,
+     but we'll at least complain about it...
+   */
+  return w ? MP_RANGE : MP_OKAY;
+}
 /* {{{ s_mp_mul(a, b) */
 
 /* Compute a = |a| * |b|                                                  */
@@ -3298,24 +3371,55 @@ mp_err   s_mp_mul(mp_int *a, const mp_int *b)
 
 /* }}} */
 
-/* c += a * b */
-void s_mp_mul_add(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c)
+#if !defined(MP_ASSEMBLY_MULTIPLY)
+/* c = a * b */
+void s_mpv_mul_d(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c)
 {
-  mp_word   w = 0;
+  mp_digit   d = 0;
 
   /* Inner product:  Digits of a */
   while (a_len--) {
-    w += ((mp_word)b * *a++) + *c;
+    mp_word w = ((mp_word)b * *a++) + d;
     *c++ = ACCUM(w);
-    w = CARRYOUT(w);
+    d = CARRYOUT(w);
+  }
+  *c = d;
+}
+
+/* c += a * b */
+void s_mpv_mul_d_add(const mp_digit *a, mp_size a_len, mp_digit b, 
+			      mp_digit *c)
+{
+  mp_digit   d = 0;
+
+  /* Inner product:  Digits of a */
+  while (a_len--) {
+    mp_word w = ((mp_word)b * *a++) + *c + d;
+    *c++ = ACCUM(w);
+    d = CARRYOUT(w);
+  }
+  *c = d;
+}
+
+/* c += a * b */
+void s_mpv_mul_d_add_prop(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c)
+{
+  mp_digit   d = 0;
+
+  /* Inner product:  Digits of a */
+  while (a_len--) {
+    mp_word w = ((mp_word)b * *a++) + *c + d;
+    *c++ = ACCUM(w);
+    d = CARRYOUT(w);
   }
 
-  while (w) {
-    w += *c;
+  while (d) {
+    mp_word w = (mp_word)*c + d;
     *c++ = ACCUM(w);
-    w = CARRYOUT(w);
+    d = CARRYOUT(w);
   }
 }
+#endif
 
 #if MP_SQUARE
 /* {{{ s_mp_sqr(a) */
diff --git a/security/nss/lib/freebl/mpi/mpi.h b/security/nss/lib/freebl/mpi/mpi.h
index b92f8ffec23..5851afcc212 100644
--- a/security/nss/lib/freebl/mpi/mpi.h
+++ b/security/nss/lib/freebl/mpi/mpi.h
@@ -36,7 +36,7 @@
  * may use your version of this file under either the MPL or the
  * GPL.
  *
- *  $Id: mpi.h,v 1.7 2000-08-04 19:57:24 nelsonb%netscape.com Exp $
+ *  $Id: mpi.h,v 1.8 2000-08-22 01:57:33 nelsonb%netscape.com Exp $
  */
 
 #ifndef _H_MPI_
@@ -68,7 +68,7 @@
 #define  MP_UNDEF        -5 /* answer is undefined   */
 #define  MP_LAST_CODE    MP_UNDEF
 
-typedef char              mp_sign;
+typedef unsigned int      mp_sign;
 typedef unsigned int      mp_size;
 typedef int               mp_err;
 
@@ -245,6 +245,9 @@ mp_err mp_to_unsigned_octets(const mp_int *mp, unsigned char *str, mp_size maxle
 mp_err mp_to_signed_octets(const mp_int *mp, unsigned char *str, mp_size maxlen);
 mp_err mp_to_fixlen_octets(const mp_int *mp, unsigned char *str, mp_size len);
 
+#define MP_CHECKOK(x)  if (MP_OKAY > (res = (x))) goto CLEANUP
+#define MP_CHECKERR(x) if (MP_OKAY > (res = (x))) goto CLEANUP
+
 #if defined(MP_API_COMPATIBLE)
 #define NEG             MP_NEG
 #define ZPOS            MP_ZPOS
diff --git a/security/nss/lib/freebl/mpi/mpmontg.c b/security/nss/lib/freebl/mpi/mpmontg.c
index 00dd74108b9..b14cbbe1101 100644
--- a/security/nss/lib/freebl/mpi/mpmontg.c
+++ b/security/nss/lib/freebl/mpi/mpmontg.c
@@ -29,7 +29,7 @@
  * the GPL.  If you do not delete the provisions above, a recipient
  * may use your version of this file under either the MPL or the
  * GPL.
- *  $Id: mpmontg.c,v 1.6 2000-08-08 03:20:35 nelsonb%netscape.com Exp $
+ *  $Id: mpmontg.c,v 1.7 2000-08-22 01:57:34 nelsonb%netscape.com Exp $
  */
 
 /* This file implements moduluar exponentiation using Montgomery's
@@ -46,13 +46,11 @@
 #include "mplogic.h"
 #include "mpprime.h"
 
-#define MP_CHECKOK(x) if (MP_OKAY != (rv = (x))) goto loser
-#define MP_CHECKERR(x) if (0 > (rv = (x))) goto loser
 #define STATIC
 /* #define DEBUG 1  */
 
-#define WINDOW_BITS 5
-#define ODD_INTS    16   /* 2 ** (WINDOW_BITS - 1) */
+#define MAX_WINDOW_BITS 6
+#define MAX_ODD_INTS    32   /* 2 ** (WINDOW_BITS - 1) */
 
 typedef struct {
   mp_int       N;	/* modulus N */
@@ -60,17 +58,22 @@ typedef struct {
   mp_size      b;	/* R == 2 ** b,  also b = # significant bits in N */
 } mp_mont_modulus;
 
+mp_err   s_mp_mul_mont(const mp_int *a, const mp_int *b, mp_int *c, 
+	               mp_mont_modulus *mmm);
+
 /* computes T = REDC(T), 2^b == R */
 STATIC
 mp_err s_mp_redc(mp_int *T, mp_mont_modulus *mmm)
 {
-  mp_err rv;
-  int i;
+  mp_err res;
+  mp_size i;
 #ifdef DEBUG
   mp_int m;
+  MP_DIGITS(&m) = 0;
 #endif
 
-  MP_CHECKOK( s_mp_pad(T, MP_USED(T) + MP_USED(&mmm->N) + 2) );
+  i = MP_USED(T) + MP_USED(&mmm->N) + 2;
+  MP_CHECKOK( s_mp_pad(T, i) );
   for (i = 0; i < MP_USED(&mmm->N); ++i ) {
     mp_digit m_i = MP_DIGIT(T, i) * mmm->n0prime;
     /* T += N * m_i * (MP_RADIX ** i); */
@@ -84,39 +87,100 @@ mp_err s_mp_redc(mp_int *T, mp_mont_modulus *mmm)
   MP_CHECKOK( mp_div_2d(T, mmm->b, T, &m));
   /* here, remainder m should be equal to zero */
   if (mp_cmp_z(&m) != 0) {
-    rv = MP_UNDEF;
-    goto loser;
+    res = MP_UNDEF;
+    goto CLEANUP;
   }
 #else
   s_mp_div_2d(T, mmm->b); 
 #endif
 
-  if ((rv = s_mp_cmp(T, &mmm->N)) >= 0) {
+  if ((res = s_mp_cmp(T, &mmm->N)) >= 0) {
     /* T = T - N */
     MP_CHECKOK( s_mp_sub(T, &mmm->N) );
 #ifdef DEBUG
-    if ((rv = mp_cmp(T, &mmm->N)) >= 0) {
-      rv = MP_UNDEF;
+    if ((res = mp_cmp(T, &mmm->N)) >= 0) {
+      res = MP_UNDEF;
+      goto CLEANUP;
     }
 #endif
   }
-  rv = MP_OKAY;
-loser:
+  res = MP_OKAY;
+CLEANUP:
 #ifdef DEBUG
   mp_clear(&m);
 #endif
-  return rv;
+  return res;
 }
 
-mp_err mp_to_mont(const mp_int *x, mp_mont_modulus *mmm, mp_int *xMont)
+#if !defined(MP_ASSEMBLY_MUL_MONT) && !defined(MP_MONT_USE_MP_MUL)
+mp_err s_mp_mul_mont(const mp_int *a, const mp_int *b, mp_int *c, 
+	           mp_mont_modulus *mmm)
 {
-  mp_err rv;
+  mp_digit *pb;
+  mp_digit m_i;
+  mp_err   res;
+  mp_size  ib;
+  mp_size  useda, usedb;
+
+  ARGCHK(a != NULL && b != NULL && c != NULL, MP_BADARG);
+
+  if (MP_USED(a) < MP_USED(b)) {
+    const mp_int *xch = b;	/* switch a and b, to do fewer outer loops */
+    b = a;
+    a = xch;
+  }
+
+  MP_USED(c) = 1; MP_DIGIT(c, 0) = 0;
+  ib = MP_USED(a) + MP_MAX(MP_USED(b), MP_USED(&mmm->N)) + 2;
+  if((res = s_mp_pad(c, ib)) != MP_OKAY)
+    goto CLEANUP;
+
+  useda = MP_USED(a);
+  pb = MP_DIGITS(b);
+  s_mpv_mul_d(MP_DIGITS(a), useda, *pb++, MP_DIGITS(c));
+  s_mp_setz(MP_DIGITS(c) + useda + 1, ib - (useda + 1));
+  m_i = MP_DIGIT(c, 0) * mmm->n0prime;
+  s_mp_mul_d_add_offset(&mmm->N, m_i, c, 0);
+
+  /* Outer loop:  Digits of b */
+  usedb = MP_USED(b);
+  for (ib = 1; ib < usedb; ib++) {
+    mp_digit b_i    = *pb++;
+
+    /* Inner product:  Digits of a */
+    if (b_i)
+      s_mpv_mul_d_add_prop(MP_DIGITS(a), useda, b_i, MP_DIGITS(c) + ib);
+    m_i = MP_DIGIT(c, ib) * mmm->n0prime;
+    s_mp_mul_d_add_offset(&mmm->N, m_i, c, ib);
+  }
+  if (usedb < MP_USED(&mmm->N)) {
+    for (usedb = MP_USED(&mmm->N); ib < usedb; ++ib ) {
+      m_i = MP_DIGIT(c, ib) * mmm->n0prime;
+      s_mp_mul_d_add_offset(&mmm->N, m_i, c, ib);
+    }
+  }
+  s_mp_clamp(c);
+  s_mp_div_2d(c, mmm->b); 
+  if (s_mp_cmp(c, &mmm->N) >= 0) {
+    MP_CHECKOK( s_mp_sub(c, &mmm->N) );
+  }
+  res = MP_OKAY;
+
+CLEANUP:
+  return res;
+}
+#endif
+
+STATIC
+mp_err s_mp_to_mont(const mp_int *x, mp_mont_modulus *mmm, mp_int *xMont)
+{
+  mp_err res;
 
   /* xMont = x * R mod N   where  N is modulus */
   MP_CHECKOK( mpl_lsh(x, xMont, mmm->b) );  		/* xMont = x << b */
   MP_CHECKOK( mp_div(xMont, &mmm->N, 0, xMont) );	/*         mod N */
-loser:
-  return rv;
+CLEANUP:
+  return res;
 }
 
 
@@ -124,28 +188,34 @@ mp_err mp_exptmod(const mp_int *inBase, const mp_int *exponent,
 		  const mp_int *modulus, mp_int *result)
 {
   const mp_int *base;
+  mp_int *pa1, *pa2, *ptmp;
   mp_size bits_in_exponent;
   mp_size i;
-  mp_err rv;
-  mp_int square, accum, goodBase, tmp;
+  mp_size window_bits, odd_ints;
+  mp_err res;
+  mp_int square, accum1, accum2, goodBase;
   mp_mont_modulus mmm;
 
   /* function for computing n0prime only works if n0 is odd */
   if (!mp_isodd(modulus))
     return s_mp_exptmod(inBase, exponent, modulus, result);
 
+  MP_DIGITS(&square) = 0;
+  MP_DIGITS(&accum1) = 0;
+  MP_DIGITS(&accum2) = 0;
+  MP_DIGITS(&goodBase) = 0;
+
   if (mp_cmp(inBase, modulus) < 0) {
     base = inBase;
-    MP_DIGITS(&goodBase) = 0;
   } else {
-    mp_init(&goodBase);
+    MP_CHECKOK( mp_init(&goodBase) );
     base = &goodBase;
     MP_CHECKOK( mp_mod(inBase, modulus, &goodBase) );
   }
 
-  mp_init_size(&square, 2 * MP_USED(modulus) + 2);
-  mp_init_size(&accum,  3 * MP_USED(modulus) + 2);
-  mp_init_size(&tmp,    3 * MP_USED(modulus) + 2);
+  MP_CHECKOK( mp_init_size(&square, 2 * MP_USED(modulus) + 2) );
+  MP_CHECKOK( mp_init_size(&accum1, 3 * MP_USED(modulus) + 2) );
+  MP_CHECKOK( mp_init_size(&accum2, 3 * MP_USED(modulus) + 2) );
 
   mmm.N = *modulus;			/* a copy of the mp_int struct */
   i = mpl_significant_bits(modulus);
@@ -157,18 +227,25 @@ mp_err mp_exptmod(const mp_int *inBase, const mp_int *exponent,
   */
   mmm.n0prime = 0 - s_mp_invmod_32b( MP_DIGIT(modulus, 0) );
 
-  MP_CHECKOK( mp_to_mont(base, &mmm, &square) );
+  MP_CHECKOK( s_mp_to_mont(base, &mmm, &square) );
 
   bits_in_exponent = mpl_significant_bits(exponent);
-  i = bits_in_exponent % WINDOW_BITS;
+  if (bits_in_exponent > 480)
+    window_bits = 6;
+  else if (bits_in_exponent > 160)
+    window_bits = 5;
+  else
+    window_bits = 4;
+  odd_ints = 1 << (window_bits - 1);
+  i = bits_in_exponent % window_bits;
   if (i != 0) {
-    bits_in_exponent += WINDOW_BITS - i;
+    bits_in_exponent += window_bits - i;
   } 
   {
     /* oddPowers[i] = base ** (2*i + 1); */
-    /* power2 = base ** 2; */
     int expOff;
-    mp_int power2, oddPowers[ODD_INTS];
+    /* power2 = base ** 2; oddPowers[i] = base ** (2*i + 1); */
+    mp_int power2, oddPowers[MAX_ODD_INTS];
 
     MP_CHECKOK( mp_init_copy(oddPowers, &square) );
 
@@ -176,97 +253,122 @@ mp_err mp_exptmod(const mp_int *inBase, const mp_int *exponent,
     MP_CHECKOK( mp_sqr(&square, &power2) );	/* square = square ** 2 */
     MP_CHECKOK( s_mp_redc(&power2, &mmm) );
 
-    for (i = 1; i < ODD_INTS; ++i) {
+    for (i = 1; i < odd_ints; ++i) {
       mp_init_size(oddPowers + i, MP_USED(modulus) + 2 * MP_USED(&power2) + 2);
       MP_CHECKOK( mp_mul(oddPowers + (i - 1), &power2, oddPowers + i) );
       MP_CHECKOK( s_mp_redc(oddPowers + i, &mmm) );
     }
-    mp_set(&accum, 1);
-    MP_CHECKOK( mp_to_mont(&accum, &mmm, &accum) );
+    mp_set(&accum1, 1);
+    MP_CHECKOK( s_mp_to_mont(&accum1, &mmm, &accum1) );
+    pa1 = &accum1;
+    pa2 = &accum2;
 
-#define SQUARE \
-    MP_CHECKOK( mp_sqr(&accum, &tmp) );\
-    mp_exch(&accum, &tmp); \
-    MP_CHECKOK( s_mp_redc(&accum, &mmm) )
+#define SQR(a,b) \
+    MP_CHECKOK( mp_sqr(a, b) );\
+    MP_CHECKOK( s_mp_redc(b, &mmm) )
 
-#define MUL(x) \
-    MP_CHECKOK( mp_mul(&accum, oddPowers + (x), &tmp) ); \
-    mp_exch(&accum, &tmp); \
-    MP_CHECKOK( s_mp_redc(&accum, &mmm))
-
-    for (expOff = bits_in_exponent - WINDOW_BITS; expOff >= 0; expOff -= WINDOW_BITS) {
-      mp_size smallExp;
-      MP_CHECKERR( mpl_get_bits(exponent, expOff, WINDOW_BITS) );
-      smallExp = (mp_size)rv;
-
-#if WINDOW_BITS == 4
-      if (!smallExp) {
-		SQUARE; SQUARE; SQUARE; SQUARE;
-      } else if (smallExp & 1) {
-		SQUARE; SQUARE; SQUARE; SQUARE; MUL(smallExp/2);
-      } else if (smallExp & 2) {
-		SQUARE; SQUARE; SQUARE; MUL(smallExp/4); SQUARE;
-      } else if (smallExp & 4) {
-		SQUARE; SQUARE; MUL(smallExp/8); SQUARE; SQUARE;
-      } else if (smallExp & 8) {
-		SQUARE; MUL(smallExp/16); SQUARE; SQUARE; SQUARE;
-      } else {
-		abort();
-      }
-#elif WINDOW_BITS == 5
-      if (!smallExp) {
-		SQUARE; SQUARE; SQUARE; SQUARE; SQUARE;
-      } else if (smallExp & 1) {
-		SQUARE; SQUARE; SQUARE; SQUARE; SQUARE; MUL(smallExp/2);
-      } else if (smallExp & 2) {
-		SQUARE; SQUARE; SQUARE; SQUARE; MUL(smallExp/4); SQUARE;
-      } else if (smallExp & 4) {
-		SQUARE; SQUARE; SQUARE; MUL(smallExp/8); SQUARE; SQUARE;
-      } else if (smallExp & 8) {
-		SQUARE; SQUARE; MUL(smallExp/16); SQUARE; SQUARE; SQUARE;
-      } else if (smallExp & 0x10) {
-		SQUARE; MUL(smallExp/32); SQUARE; SQUARE; SQUARE; SQUARE;
-      } else {
-		abort();
-      }
-#elif WINDOW_BITS == 6
-      if (!smallExp) {
-	      SQUARE; SQUARE; SQUARE; SQUARE; SQUARE; SQUARE;
-      } else if (smallExp & 1) {
-	      SQUARE; SQUARE; SQUARE; SQUARE; SQUARE; SQUARE; MUL(smallExp/2);
-      } else if (smallExp & 2) {
-	      SQUARE; SQUARE; SQUARE; SQUARE; SQUARE; MUL(smallExp/4); SQUARE;
-      } else if (smallExp & 4) {
-	      SQUARE; SQUARE; SQUARE; SQUARE; MUL(smallExp/8); SQUARE; SQUARE;
-      } else if (smallExp & 8) {
-	      SQUARE; SQUARE; SQUARE; MUL(smallExp/16); SQUARE; SQUARE; SQUARE;
-      } else if (smallExp & 0x10) {
-	      SQUARE; SQUARE; MUL(smallExp/32); SQUARE; SQUARE; SQUARE; SQUARE;
-      } else if (smallExp & 0x20) {
-	      SQUARE; MUL(smallExp/64); SQUARE; SQUARE; SQUARE; SQUARE; SQUARE;
-      } else {
-		abort();
-      }
+#if defined(MP_MONT_USE_MP_MUL)
+#define MUL(x,a,b) \
+    MP_CHECKOK( mp_mul(a, oddPowers + (x), b) ); \
+    MP_CHECKOK( s_mp_redc(b, &mmm) ) 
 #else
-#error "Unknown value for WINDOW_BITS"
+#define MUL(x,a,b) \
+    MP_CHECKOK( s_mp_mul_mont(a, oddPowers + (x), b, &mmm) )
 #endif
+
+#define SWAPPA ptmp = pa1; pa1 = pa2; pa2 = ptmp
+
+    for (expOff = bits_in_exponent - window_bits; expOff >= 0; expOff -= window_bits) {
+      mp_size smallExp;
+      MP_CHECKOK( mpl_get_bits(exponent, expOff, window_bits) );
+      smallExp = (mp_size)res;
+
+      if (window_bits == 4) {
+	if (!smallExp) {
+	  SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1);
+	} else if (smallExp & 1) {
+	  SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1); 
+	  MUL(smallExp/2, pa1,pa2); SWAPPA;
+	} else if (smallExp & 2) {
+	  SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); 
+	  MUL(smallExp/4,pa2,pa1); SQR(pa1,pa2); SWAPPA;
+	} else if (smallExp & 4) {
+	  SQR(pa1,pa2); SQR(pa2,pa1); MUL(smallExp/8,pa1,pa2); 
+	  SQR(pa2,pa1); SQR(pa1,pa2); SWAPPA;
+	} else if (smallExp & 8) {
+	  SQR(pa1,pa2); MUL(smallExp/16,pa2,pa1); SQR(pa1,pa2); 
+	  SQR(pa2,pa1); SQR(pa1,pa2); SWAPPA;
+	} else {
+	  abort();
+	}
+      } else if (window_bits == 5) {
+	if (!smallExp) {
+	  SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1); 
+	  SQR(pa1,pa2); SWAPPA;
+	} else if (smallExp & 1) {
+	  SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1); 
+	  SQR(pa1,pa2); MUL(smallExp/2,pa2,pa1);
+	} else if (smallExp & 2) {
+	  SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1); 
+	  MUL(smallExp/4,pa1,pa2); SQR(pa2,pa1);
+	} else if (smallExp & 4) {
+	  SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); 
+	  MUL(smallExp/8,pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1);
+	} else if (smallExp & 8) {
+	  SQR(pa1,pa2); SQR(pa2,pa1); MUL(smallExp/16,pa1,pa2); 
+	  SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1);
+	} else if (smallExp & 0x10) {
+	  SQR(pa1,pa2); MUL(smallExp/32,pa2,pa1); SQR(pa1,pa2); 
+	  SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1);
+	} else {
+	    abort();
+	}
+      } else if (window_bits == 6) {
+	if (!smallExp) {
+	  SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1); 
+	  SQR(pa1,pa2); SQR(pa2,pa1);
+	} else if (smallExp & 1) {
+	  SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1); 
+	  SQR(pa1,pa2); SQR(pa2,pa1); MUL(smallExp/2,pa1,pa2); SWAPPA;
+	} else if (smallExp & 2) {
+	  SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1); 
+	  SQR(pa1,pa2); MUL(smallExp/4,pa2,pa1); SQR(pa1,pa2); SWAPPA;
+	} else if (smallExp & 4) {
+	  SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1); 
+	  MUL(smallExp/8,pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SWAPPA;
+	} else if (smallExp & 8) {
+	  SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); 
+	  MUL(smallExp/16,pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1); 
+	  SQR(pa1,pa2); SWAPPA;
+	} else if (smallExp & 0x10) {
+	  SQR(pa1,pa2); SQR(pa2,pa1); MUL(smallExp/32,pa1,pa2); 
+	  SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SWAPPA;
+	} else if (smallExp & 0x20) {
+	  SQR(pa1,pa2); MUL(smallExp/64,pa2,pa1); SQR(pa1,pa2); 
+	  SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SWAPPA;
+	} else {
+	  abort();
+	}
+      } else {
+	abort();
+      }
     }
 
     mp_clear(&power2);
-    for (i = 0; i < ODD_INTS; ++i) {
+    for (i = 0; i < odd_ints; ++i) {
       mp_clear(oddPowers + i);
     }
   }
-  rv = s_mp_redc(&accum, &mmm);
-  mp_exch(&accum, result);
-loser:
+  res = s_mp_redc(pa1, &mmm);
+  mp_exch(pa1, result);
+CLEANUP:
   mp_clear(&square);
-  mp_clear(&accum);
+  mp_clear(&accum1);
+  mp_clear(&accum2);
   mp_clear(&goodBase);
-  mp_clear(&tmp);
   /* Don't mp_clear mmm.N because it is merely a copy of modulus.
   ** Just zap it.
   */
   memset(&mmm, 0, sizeof mmm);
-  return rv;
+  return res;
 }