From f9eed9654683ea7aebb1856eae5ec9b14a488106 Mon Sep 17 00:00:00 2001 From: "nelsonb%netscape.com" Date: Tue, 22 Aug 2000 01:57:34 +0000 Subject: [PATCH] Big changes for performance. - In mpi-priv.h, declare new 3-argument versions of s_mp_add and s_mp_sub. Also declare new set of s_mpv_ functions that operate on vectors (arrays) of mp_digits instead of on mp_ints. These functions are candidates for implementation in assembler. - In mpi.c reimplement mp_add and mp_sub using the new 3arg functions. Implement 3-argument versions of s_mp_add and s_mp_sub. This eliminates all need for temporary variables in mp_add and mp_sub. Implement C language reference implementations of new s_mpv vector multiply and multiply-and-add functions. Change mp_mul and mp_sqr so they no longer pre-zero the output variable. It's no longer necessary with the new s_mpv functions. s_mp_pad no longer zeros out the new padded space. - In mpmontg.c, implement variable-width exponentiation windows. Implement a new function to compute the multiply and Montgomery reduction in a single pass. This is "Improvement 2" from Dusse' and Kaliski's paper "A Cryptographic Library for the Motorola DSP56000". Performance impact is negligible in this C implementation. However, this function is another target for assembly language optimization. 
--- security/nss/lib/freebl/mpi/mpi-priv.h | 26 +- security/nss/lib/freebl/mpi/mpi.c | 346 ++++++++++++++++--------- security/nss/lib/freebl/mpi/mpi.h | 7 +- security/nss/lib/freebl/mpi/mpmontg.c | 312 ++++++++++++++-------- 4 files changed, 456 insertions(+), 235 deletions(-) diff --git a/security/nss/lib/freebl/mpi/mpi-priv.h b/security/nss/lib/freebl/mpi/mpi-priv.h index f12720bd1e2..d138d9280b7 100644 --- a/security/nss/lib/freebl/mpi/mpi-priv.h +++ b/security/nss/lib/freebl/mpi/mpi-priv.h @@ -1,8 +1,11 @@ /* - * mpi-priv.h - * + * mpi-priv.h - Private header file for MPI * Arbitrary precision integer arithmetic library * + * NOTE WELL: the content of this header file is NOT part of the "public" + * API for the MPI library, and may change at any time. + * Application programs that use libmpi should NOT include this header file. + * * The contents of this file are subject to the Mozilla Public * License Version 1.1 (the "License"); you may not use this file * except in compliance with the License. You may obtain a copy of @@ -35,7 +38,7 @@ * the GPL. If you do not delete the provisions above, a recipient * may use your version of this file under either the MPL or the GPL. 
* - * $Id: mpi-priv.h,v 1.8 2000-08-09 20:42:18 nelsonb%netscape.com Exp $ + * $Id: mpi-priv.h,v 1.9 2000-08-22 01:57:33 nelsonb%netscape.com Exp $ */ #ifndef _MPI_PRIV_H_ #define _MPI_PRIV_H_ 1 @@ -142,6 +145,7 @@ extern const float s_logv_2[]; void s_mp_free(void *ptr); /* general free function */ extern unsigned long mp_allocs; extern unsigned long mp_frees; +extern unsigned long mp_copies; #else /* Even if these are defined as macros, we need to respect the settings @@ -198,9 +202,11 @@ mp_err s_mp_mod_d(mp_int *mp, mp_digit d, mp_digit *r); mp_err s_mp_reduce(mp_int *x, const mp_int *m, const mp_int *mu); /* Barrett reduction */ mp_err s_mp_add(mp_int *a, const mp_int *b); /* magnitude addition */ +mp_err s_mp_add_3arg(const mp_int *a, const mp_int *b, mp_int *c); +mp_err s_mp_sub(mp_int *a, const mp_int *b); /* magnitude subtract */ +mp_err s_mp_sub_3arg(const mp_int *a, const mp_int *b, mp_int *c); mp_err s_mp_add_offset(mp_int *a, mp_int *b, mp_size offset); /* a += b * RADIX^offset */ -mp_err s_mp_sub(mp_int *a, const mp_int *b); /* magnitude subtract */ mp_err s_mp_mul(mp_int *a, const mp_int *b); /* magnitude multiply */ #if MP_SQUARE mp_err s_mp_sqr(mp_int *a); /* magnitude square */ @@ -219,11 +225,17 @@ int s_mp_tovalue(char ch, int r); /* convert ch to value */ char s_mp_todigit(mp_digit val, int r, int low); /* convert val to digit */ int s_mp_outlen(int bits, int r); /* output length in bytes */ mp_digit s_mp_invmod_32b(mp_digit P); /* returns (P ** -1) mod (2 ** 32) */ -void s_mp_mul_add(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c); - /* c += a * b * (MP_RADIX ** offset); */ +/* ------ mpv functions, operate on arrays of digits, not on mp_int's ------ */ +void s_mpv_mul_d(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c); +void s_mpv_mul_d_add(const mp_digit *a, mp_size a_len, mp_digit b, + mp_digit *c); +void s_mpv_mul_d_add_prop(const mp_digit *a, mp_size a_len, mp_digit b, + mp_digit *c); + +/* c += a * b * (MP_RADIX ** 
offset); */ #define s_mp_mul_d_add_offset(a, b, c, off) \ - (s_mp_mul_add(MP_DIGITS(a), MP_USED(a), b, MP_DIGITS(c) + off), MP_OKAY) +(s_mpv_mul_d_add_prop(MP_DIGITS(a), MP_USED(a), b, MP_DIGITS(c) + off), MP_OKAY) /* }}} */ #endif diff --git a/security/nss/lib/freebl/mpi/mpi.c b/security/nss/lib/freebl/mpi/mpi.c index ff687c58a80..2f8d9928674 100644 --- a/security/nss/lib/freebl/mpi/mpi.c +++ b/security/nss/lib/freebl/mpi/mpi.c @@ -35,7 +35,7 @@ * the GPL. If you do not delete the provisions above, a recipient * may use your version of this file under either the MPL or the GPL. * - * $Id: mpi.c,v 1.18 2000-08-11 01:58:20 nelsonb%netscape.com Exp $ + * $Id: mpi.c,v 1.19 2000-08-22 01:57:34 nelsonb%netscape.com Exp $ */ #include "mpi-priv.h" @@ -81,8 +81,8 @@ static const char *s_dmap_1 = unsigned long mp_allocs; unsigned long mp_frees; +unsigned long mp_copies; -#define MP_CHECKOK(x) if (MP_OKAY > (res = (x))) goto CLEANUP /* {{{ Default precision manipulation */ @@ -200,6 +200,7 @@ mp_err mp_copy(const mp_int *from, mp_int *to) if(from == to) return MP_OKAY; + ++mp_copies; { /* copy */ mp_digit *tmp; @@ -702,38 +703,22 @@ mp_err mp_neg(const mp_int *a, mp_int *b) mp_err mp_add(const mp_int *a, const mp_int *b, mp_int *c) { - mp_int tmp; mp_err res; ARGCHK(a != NULL && b != NULL && c != NULL, MP_BADARG); - MP_DIGITS(&tmp) = 0; - if (c == a) { - MP_CHECKOK( mp_init_copy(&tmp, a) ); - a = &tmp; - if (c == b) - b = &tmp; - } else if (c == b) { - MP_CHECKOK( mp_init_copy(&tmp, b) ); - b = &tmp; - } - if(SIGN(a) == SIGN(b)) { /* same sign: add values, keep sign */ - MP_CHECKOK( mp_copy(a, c) ); - MP_CHECKOK( s_mp_add(c, b) ); + MP_CHECKOK( s_mp_add_3arg(a, b, c) ); } else if(s_mp_cmp(a, b) >= 0) { /* different sign: a >= b */ - MP_CHECKOK( mp_copy(a, c) ); - MP_CHECKOK( s_mp_sub(c, b) ); + MP_CHECKOK( s_mp_sub_3arg(a, b, c) ); } else { /* different sign: a < b */ - MP_CHECKOK( mp_copy(b, c) ); - MP_CHECKOK( s_mp_sub(c, a) ); + MP_CHECKOK( s_mp_sub_3arg(b, a, c) ); } if 
(s_mp_cmp_d(c, 0) == MP_EQ) SIGN(c) = ZPOS; CLEANUP: - mp_clear(&tmp); return res; } /* end mp_add() */ @@ -750,37 +735,25 @@ CLEANUP: mp_err mp_sub(const mp_int *a, const mp_int *b, mp_int *c) { - mp_int tmp; mp_err res; - int diffSign; + int magDiff; ARGCHK(a != NULL && b != NULL && c != NULL, MP_BADARG); - MP_DIGITS(&tmp) = 0; if (a == b) { mp_zero(c); return MP_OKAY; } - if (c == a) { - MP_CHECKOK( mp_init_copy(&tmp, a) ); - a = &tmp; - } else if (c == b) { - MP_CHECKOK( mp_init_copy(&tmp, b) ); - b = &tmp; - } if (MP_SIGN(a) != MP_SIGN(b)) { - MP_CHECKOK( mp_copy(a, c) ); - MP_CHECKOK( s_mp_add(c, b) ); - } else if (!(diffSign = s_mp_cmp(a, b))) { + MP_CHECKOK( s_mp_add_3arg(a, b, c) ); + } else if (!(magDiff = s_mp_cmp(a, b))) { mp_zero(c); res = MP_OKAY; - } else if (diffSign > 0) { - MP_CHECKOK( mp_copy(a, c) ); - MP_CHECKOK( s_mp_sub(c, b) ); + } else if (magDiff > 0) { + MP_CHECKOK( s_mp_sub_3arg(a, b, c) ); } else { - MP_CHECKOK( mp_copy(b, c) ); - MP_CHECKOK( s_mp_sub(c, a) ); + MP_CHECKOK( s_mp_sub_3arg(b, a, c) ); MP_SIGN(c) = !MP_SIGN(a); } @@ -788,7 +761,6 @@ mp_err mp_sub(const mp_int *a, const mp_int *b, mp_int *c) MP_SIGN(c) = MP_ZPOS; CLEANUP: - mp_clear(&tmp); return res; } /* end mp_sub() */ @@ -808,6 +780,7 @@ mp_err mp_mul(const mp_int *a, const mp_int *b, mp_int * c) mp_int tmp; mp_err res; mp_size ib; + mp_size useda, usedb; ARGCHK(a != NULL && b != NULL && c != NULL, MP_BADARG); @@ -831,21 +804,24 @@ mp_err mp_mul(const mp_int *a, const mp_int *b, mp_int * c) a = xch; } - /* This has the effect of left-padding with zeroes... 
*/ MP_USED(c) = 1; MP_DIGIT(c, 0) = 0; if((res = s_mp_pad(c, USED(a) + USED(b))) != MP_OKAY) goto CLEANUP; pb = MP_DIGITS(b); + s_mpv_mul_d(MP_DIGITS(a), MP_USED(a), *pb++, MP_DIGITS(c)); + /* Outer loop: Digits of b */ - for(ib = 0; ib < USED(b); ib++) { + useda = MP_USED(a); + usedb = MP_USED(b); + for (ib = 1; ib < usedb; ib++) { mp_digit b_i = *pb++; - if(b_i == 0) - continue; - /* Inner product: Digits of a */ - s_mp_mul_d_add_offset(a, b_i, c, ib); + if (b_i) + s_mpv_mul_d_add(MP_DIGITS(a), useda, b_i, MP_DIGITS(c) + ib); + else + MP_DIGIT(c, ib + useda) = b_i; } s_mp_clamp(c); @@ -876,12 +852,13 @@ CLEANUP: /* sqr = a^2; Caller provides both a and tmp; */ mp_err mp_sqr(const mp_int *a, mp_int *sqr) { - mp_digit *pa, *ps, *alim; + mp_digit *pa, *ps; mp_word w; - mp_digit d, k; + mp_digit d; mp_err res; mp_size ix; mp_int tmp; + int count; ARGCHK(a != NULL && sqr != NULL, MP_BADARG); @@ -893,56 +870,32 @@ mp_err mp_sqr(const mp_int *a, mp_int *sqr) DIGITS(&tmp) = 0; } - /* Left-pad with zeroes */ - MP_USED(sqr) = 1; MP_DIGIT(sqr, 0) = 0; - if((res = s_mp_pad(sqr, 2 * USED(a))) != MP_OKAY) - goto CLEANUP; - - /* - The inner product is computed as: - (C, S) = t[i,j] + 2 a[i] a[j] + C - */ + ix = 2 * MP_USED(a); + if (ix > MP_ALLOC(sqr)) { + MP_USED(sqr) = 1; + MP_CHECKOK( s_mp_grow(sqr, ix) ); + } + MP_USED(sqr) = ix; + MP_DIGIT(sqr, 0) = 0; pa = MP_DIGITS(a); - alim = pa + MP_USED(a); - for (ix = 0; pa < alim; ++ix) { + count = MP_USED(a) - 1; + if (count > 0) { d = *pa++; - ps = MP_DIGITS(sqr) + 1 + (ix << 1); - s_mp_mul_add(pa, alim - pa, d, ps); - } /* for(ix ...) */ + s_mpv_mul_d(pa, count, d, MP_DIGITS(sqr) + 1); + for (ix = 3; --count > 0; ix += 2) { + d = *pa++; + s_mpv_mul_d_add(pa, count, d, MP_DIGITS(sqr) + ix); + } /* for(ix ...) */ + MP_DIGIT(sqr, MP_USED(sqr)-1) = 0; /* above loop stopped short of this. 
*/ - /* now sqr *= 2 */ - ps = MP_DIGITS(sqr); - k = 0; -#define SHIFT_1_BIT(n) \ - d = ps[n]; ps[n] = (d << 1) | k; k = d >> (DIGIT_BIT - 1) - for (ix = MP_USED(sqr); ix >= 8; ix -= 8) { - SHIFT_1_BIT(0); - SHIFT_1_BIT(1); - SHIFT_1_BIT(2); - SHIFT_1_BIT(3); - SHIFT_1_BIT(4); - SHIFT_1_BIT(5); - SHIFT_1_BIT(6); - SHIFT_1_BIT(7); - ps += 8; - } - if (ix) { - ps += ix; - switch (ix) { /* all fallthru */ - case 7: SHIFT_1_BIT(-7); /* FALLTHRU */ - case 6: SHIFT_1_BIT(-6); /* FALLTHRU */ - case 5: SHIFT_1_BIT(-5); /* FALLTHRU */ - case 4: SHIFT_1_BIT(-4); /* FALLTHRU */ - case 3: SHIFT_1_BIT(-3); /* FALLTHRU */ - case 2: SHIFT_1_BIT(-2); /* FALLTHRU */ - case 1: SHIFT_1_BIT(-1); /* FALLTHRU */ - case 0: break; - } + /* now sqr *= 2 */ + s_mp_mul_2(sqr); + } else { + MP_DIGIT(sqr, 1) = 0; } pa = MP_DIGITS(a); - alim = pa + MP_USED(a); ps = MP_DIGITS(sqr); w = 0; #define ADD_SQUARE(n) \ @@ -2544,7 +2497,7 @@ mp_err s_mp_pad(mp_int *mp, mp_size min) if ((res = s_mp_grow(mp, min)) != MP_OKAY) return res; } else { - s_mp_setz(DIGITS(mp) + USED(mp), min - USED(mp)); +/* s_mp_setz(DIGITS(mp) + USED(mp), min - USED(mp)); */ } /* Increase precision; should already be 0-filled */ @@ -2770,12 +2723,15 @@ void s_mp_rshd(mp_int *mp, mp_size p) for (ix = USED(mp) - p; ix > 0; ix--) *dst++ = *src++; + MP_USED(mp) -= p; /* Fill the top digits with zeroes */ while (p-- > 0) *dst++ = 0; +#if 0 /* Strip off any leading zeroes */ s_mp_clamp(mp); +#endif } /* end s_mp_rshd() */ @@ -2796,21 +2752,23 @@ void s_mp_div_2(mp_int *mp) mp_err s_mp_mul_2(mp_int *mp) { - int ix; - mp_digit kin = 0, kout; - mp_err res; + mp_digit *pd; + int ix, used; + mp_digit kin = 0; /* Shift digits leftward by 1 bit */ - for(ix = 0; ix < USED(mp); ix++) { - kout = (DIGIT(mp, ix) >> (DIGIT_BIT - 1)) & 1; - DIGIT(mp, ix) = (DIGIT(mp, ix) << 1) | kin; - - kin = kout; + used = MP_USED(mp); + pd = MP_DIGITS(mp); + for (ix = 0; ix < used; ix++) { + mp_digit d = *pd; + *pd++ = (d << 1) | kin; + kin = (d >> (DIGIT_BIT 
- 1)); } /* Deal with rollover from last digit */ - if(kin) { - if(ix >= ALLOC(mp)) { + if (kin) { + if (ix >= ALLOC(mp)) { + mp_err res; if((res = s_mp_grow(mp, ALLOC(mp) + 1)) != MP_OKAY) return res; } @@ -3139,8 +3097,10 @@ mp_err s_mp_mod_d(mp_int *mp, mp_digit d, mp_digit *r) /* Compute a = |a| + |b| */ mp_err s_mp_add(mp_int *a, const mp_int *b) /* magnitude addition */ { - mp_word w, k = 0; + mp_digit *pa, *pb; + mp_word w = 0; mp_size ix; + mp_size used; mp_err res; /* Make sure a has enough precision for the output value */ @@ -3154,19 +3114,23 @@ mp_err s_mp_add(mp_int *a, const mp_int *b) /* magnitude addition */ less precision, we'll have to make sure the carry out is duly propagated upward among the higher-order digits of the sum. */ - for(ix = 0; ix < USED(b); ix++) { - w = (mp_word)DIGIT(a, ix) + DIGIT(b, ix) + k; - DIGIT(a, ix) = ACCUM(w); - k = CARRYOUT(w); + pa = MP_DIGITS(a); + pb = MP_DIGITS(b); + used = MP_USED(b); + for(ix = 0; ix < used; ix++) { + w = w + *pa + *pb++; + *pa++ = ACCUM(w); + w = CARRYOUT(w); } /* If we run out of 'b' digits before we're actually done, make sure the carries get propagated upward... */ - while(k && ix < USED(a)) { - w = (mp_word)DIGIT(a, ix) + k; - DIGIT(a, ix) = ACCUM(w); - k = CARRYOUT(w); + used = MP_USED(a); + while (w && ix < used) { + w = w + *pa; + *pa++ = ACCUM(w); + w = CARRYOUT(w); ++ix; } @@ -3174,11 +3138,11 @@ mp_err s_mp_add(mp_int *a, const mp_int *b) /* magnitude addition */ it. We could have done this initially, but why touch the memory allocator unless we're sure we have to? 
*/ - if(k) { + if (w) { if((res = s_mp_pad(a, USED(a) + 1)) != MP_OKAY) return res; - DIGIT(a, ix) = (mp_digit)k; + DIGIT(a, ix) = (mp_digit)w; } return MP_OKAY; @@ -3187,6 +3151,69 @@ mp_err s_mp_add(mp_int *a, const mp_int *b) /* magnitude addition */ /* }}} */ +/* Compute c = |a| + |b| */ /* magnitude addition */ +mp_err s_mp_add_3arg(const mp_int *a, const mp_int *b, mp_int *c) +{ + mp_digit *pa, *pb, *pc; + mp_word w = 0; + mp_size ix; + mp_size used; + mp_err res; + + MP_SIGN(c) = MP_SIGN(a); + if (MP_USED(a) < MP_USED(b)) { + const mp_int *xch = a; + a = b; + b = xch; + } + + /* Make sure a has enough precision for the output value */ + if (MP_OKAY != (res = s_mp_pad(c, MP_USED(a)))) + return res; + + /* + Add up all digits up to the precision of b. If b had initially + the same precision as a, or greater, we took care of it by the + exchange step above, so there is no problem. If b had initially + less precision, we'll have to make sure the carry out is duly + propagated upward among the higher-order digits of the sum. + */ + pa = MP_DIGITS(a); + pb = MP_DIGITS(b); + pc = MP_DIGITS(c); + used = MP_USED(b); + for (ix = 0; ix < used; ix++) { + w = w + *pa++ + *pb++; + *pc++ = ACCUM(w); + w = CARRYOUT(w); + } + + /* If we run out of 'b' digits before we're actually done, make + sure the carries get propagated upward... + */ + used = MP_USED(a); + while (ix < used) { + w = w + *pa++; + *pc++ = ACCUM(w); + w = CARRYOUT(w); + ++ix; + } + + /* If there's an overall carry out, increase precision and include + it. We could have done this initially, but why touch the memory + allocator unless we're sure we have to? 
+ */ + if (w) { + if((res = s_mp_pad(c, ix + 1)) != MP_OKAY) + return res; + + DIGIT(c, ix) = (mp_digit)w; + ++ix; + } + MP_USED(c) = ix; + return MP_OKAY; + +} /* {{{ s_mp_add_offset(a, b, offset) */ /* Compute a = |a| + ( |b| * (RADIX ** offset) ) */ @@ -3288,6 +3315,52 @@ mp_err s_mp_sub(mp_int *a, const mp_int *b) /* magnitude subtract */ /* }}} */ +/* Compute c = |a| - |b|, assumes |a| >= |b| */ /* magnitude subtract */ +mp_err s_mp_sub_3arg(const mp_int *a, const mp_int *b, mp_int *c) +{ + mp_digit *pa, *pb, *pc; + mp_sword w = 0; + int ix, limit; + mp_err res; + + MP_SIGN(c) = MP_SIGN(a); + + /* Make sure a has enough precision for the output value */ + if (MP_OKAY != (res = s_mp_pad(c, MP_USED(a)))) + return res; + + /* + Subtract and propagate borrow. Up to the precision of b, this + accounts for the digits of b; after that, we just make sure the + carries get to the right place. This saves having to pad b out to + the precision of a just to make the loops work right... + */ + pa = MP_DIGITS(a); + pb = MP_DIGITS(b); + pc = MP_DIGITS(c); + limit = MP_USED(b); + for (ix = 0; ix < limit; ++ix) { + w = w + *pa++ - *pb++; + *pc++ = ACCUM(w); + w >>= MP_DIGIT_BIT; + } + for (limit = MP_USED(a); ix < limit; ++ix) { + w = w + *pa++; + *pc++ = ACCUM(w); + w >>= MP_DIGIT_BIT; + } + + /* Clobber any leading zeroes we created */ + MP_USED(c) = ix; + s_mp_clamp(c); + + /* + If there was a borrow out, then |b| > |a| in violation + of our input invariant. We've already done the work, + but we'll at least complain about it... + */ + return w ? 
MP_RANGE : MP_OKAY; +} /* {{{ s_mp_mul(a, b) */ /* Compute a = |a| * |b| */ @@ -3298,24 +3371,55 @@ mp_err s_mp_mul(mp_int *a, const mp_int *b) /* }}} */ -/* c += a * b */ -void s_mp_mul_add(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c) +#if !defined(MP_ASSEMBLY_MULTIPLY) +/* c = a * b */ +void s_mpv_mul_d(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c) { - mp_word w = 0; + mp_digit d = 0; /* Inner product: Digits of a */ while (a_len--) { - w += ((mp_word)b * *a++) + *c; + mp_word w = ((mp_word)b * *a++) + d; *c++ = ACCUM(w); - w = CARRYOUT(w); + d = CARRYOUT(w); + } + *c = d; +} + +/* c += a * b */ +void s_mpv_mul_d_add(const mp_digit *a, mp_size a_len, mp_digit b, + mp_digit *c) +{ + mp_digit d = 0; + + /* Inner product: Digits of a */ + while (a_len--) { + mp_word w = ((mp_word)b * *a++) + *c + d; + *c++ = ACCUM(w); + d = CARRYOUT(w); + } + *c = d; +} + +/* c += a * b */ +void s_mpv_mul_d_add_prop(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c) +{ + mp_digit d = 0; + + /* Inner product: Digits of a */ + while (a_len--) { + mp_word w = ((mp_word)b * *a++) + *c + d; + *c++ = ACCUM(w); + d = CARRYOUT(w); } - while (w) { - w += *c; + while (d) { + mp_word w = (mp_word)*c + d; *c++ = ACCUM(w); - w = CARRYOUT(w); + d = CARRYOUT(w); } } +#endif #if MP_SQUARE /* {{{ s_mp_sqr(a) */ diff --git a/security/nss/lib/freebl/mpi/mpi.h b/security/nss/lib/freebl/mpi/mpi.h index b92f8ffec23..5851afcc212 100644 --- a/security/nss/lib/freebl/mpi/mpi.h +++ b/security/nss/lib/freebl/mpi/mpi.h @@ -36,7 +36,7 @@ * may use your version of this file under either the MPL or the * GPL. 
* - * $Id: mpi.h,v 1.7 2000-08-04 19:57:24 nelsonb%netscape.com Exp $ + * $Id: mpi.h,v 1.8 2000-08-22 01:57:33 nelsonb%netscape.com Exp $ */ #ifndef _H_MPI_ @@ -68,7 +68,7 @@ #define MP_UNDEF -5 /* answer is undefined */ #define MP_LAST_CODE MP_UNDEF -typedef char mp_sign; +typedef unsigned int mp_sign; typedef unsigned int mp_size; typedef int mp_err; @@ -245,6 +245,9 @@ mp_err mp_to_unsigned_octets(const mp_int *mp, unsigned char *str, mp_size maxle mp_err mp_to_signed_octets(const mp_int *mp, unsigned char *str, mp_size maxlen); mp_err mp_to_fixlen_octets(const mp_int *mp, unsigned char *str, mp_size len); +#define MP_CHECKOK(x) if (MP_OKAY > (res = (x))) goto CLEANUP +#define MP_CHECKERR(x) if (MP_OKAY > (res = (x))) goto CLEANUP + #if defined(MP_API_COMPATIBLE) #define NEG MP_NEG #define ZPOS MP_ZPOS diff --git a/security/nss/lib/freebl/mpi/mpmontg.c b/security/nss/lib/freebl/mpi/mpmontg.c index 00dd74108b9..b14cbbe1101 100644 --- a/security/nss/lib/freebl/mpi/mpmontg.c +++ b/security/nss/lib/freebl/mpi/mpmontg.c @@ -29,7 +29,7 @@ * the GPL. If you do not delete the provisions above, a recipient * may use your version of this file under either the MPL or the * GPL. 
- * $Id: mpmontg.c,v 1.6 2000-08-08 03:20:35 nelsonb%netscape.com Exp $ + * $Id: mpmontg.c,v 1.7 2000-08-22 01:57:34 nelsonb%netscape.com Exp $ */ /* This file implements moduluar exponentiation using Montgomery's @@ -46,13 +46,11 @@ #include "mplogic.h" #include "mpprime.h" -#define MP_CHECKOK(x) if (MP_OKAY != (rv = (x))) goto loser -#define MP_CHECKERR(x) if (0 > (rv = (x))) goto loser #define STATIC /* #define DEBUG 1 */ -#define WINDOW_BITS 5 -#define ODD_INTS 16 /* 2 ** (WINDOW_BITS - 1) */ +#define MAX_WINDOW_BITS 6 +#define MAX_ODD_INTS 32 /* 2 ** (WINDOW_BITS - 1) */ typedef struct { mp_int N; /* modulus N */ @@ -60,17 +58,22 @@ typedef struct { mp_size b; /* R == 2 ** b, also b = # significant bits in N */ } mp_mont_modulus; +mp_err s_mp_mul_mont(const mp_int *a, const mp_int *b, mp_int *c, + mp_mont_modulus *mmm); + /* computes T = REDC(T), 2^b == R */ STATIC mp_err s_mp_redc(mp_int *T, mp_mont_modulus *mmm) { - mp_err rv; - int i; + mp_err res; + mp_size i; #ifdef DEBUG mp_int m; + MP_DIGITS(&m) = 0; #endif - MP_CHECKOK( s_mp_pad(T, MP_USED(T) + MP_USED(&mmm->N) + 2) ); + i = MP_USED(T) + MP_USED(&mmm->N) + 2; + MP_CHECKOK( s_mp_pad(T, i) ); for (i = 0; i < MP_USED(&mmm->N); ++i ) { mp_digit m_i = MP_DIGIT(T, i) * mmm->n0prime; /* T += N * m_i * (MP_RADIX ** i); */ @@ -84,39 +87,100 @@ mp_err s_mp_redc(mp_int *T, mp_mont_modulus *mmm) MP_CHECKOK( mp_div_2d(T, mmm->b, T, &m)); /* here, remainder m should be equal to zero */ if (mp_cmp_z(&m) != 0) { - rv = MP_UNDEF; - goto loser; + res = MP_UNDEF; + goto CLEANUP; } #else s_mp_div_2d(T, mmm->b); #endif - if ((rv = s_mp_cmp(T, &mmm->N)) >= 0) { + if ((res = s_mp_cmp(T, &mmm->N)) >= 0) { /* T = T - N */ MP_CHECKOK( s_mp_sub(T, &mmm->N) ); #ifdef DEBUG - if ((rv = mp_cmp(T, &mmm->N)) >= 0) { - rv = MP_UNDEF; + if ((res = mp_cmp(T, &mmm->N)) >= 0) { + res = MP_UNDEF; + goto CLEANUP; } #endif } - rv = MP_OKAY; -loser: + res = MP_OKAY; +CLEANUP: #ifdef DEBUG mp_clear(&m); #endif - return rv; + return res; } 
-mp_err mp_to_mont(const mp_int *x, mp_mont_modulus *mmm, mp_int *xMont) +#if !defined(MP_ASSEMBLY_MUL_MONT) && !defined(MP_MONT_USE_MP_MUL) +mp_err s_mp_mul_mont(const mp_int *a, const mp_int *b, mp_int *c, + mp_mont_modulus *mmm) { - mp_err rv; + mp_digit *pb; + mp_digit m_i; + mp_err res; + mp_size ib; + mp_size useda, usedb; + + ARGCHK(a != NULL && b != NULL && c != NULL, MP_BADARG); + + if (MP_USED(a) < MP_USED(b)) { + const mp_int *xch = b; /* switch a and b, to do fewer outer loops */ + b = a; + a = xch; + } + + MP_USED(c) = 1; MP_DIGIT(c, 0) = 0; + ib = MP_USED(a) + MP_MAX(MP_USED(b), MP_USED(&mmm->N)) + 2; + if((res = s_mp_pad(c, ib)) != MP_OKAY) + goto CLEANUP; + + useda = MP_USED(a); + pb = MP_DIGITS(b); + s_mpv_mul_d(MP_DIGITS(a), useda, *pb++, MP_DIGITS(c)); + s_mp_setz(MP_DIGITS(c) + useda + 1, ib - (useda + 1)); + m_i = MP_DIGIT(c, 0) * mmm->n0prime; + s_mp_mul_d_add_offset(&mmm->N, m_i, c, 0); + + /* Outer loop: Digits of b */ + usedb = MP_USED(b); + for (ib = 1; ib < usedb; ib++) { + mp_digit b_i = *pb++; + + /* Inner product: Digits of a */ + if (b_i) + s_mpv_mul_d_add_prop(MP_DIGITS(a), useda, b_i, MP_DIGITS(c) + ib); + m_i = MP_DIGIT(c, ib) * mmm->n0prime; + s_mp_mul_d_add_offset(&mmm->N, m_i, c, ib); + } + if (usedb < MP_USED(&mmm->N)) { + for (usedb = MP_USED(&mmm->N); ib < usedb; ++ib ) { + m_i = MP_DIGIT(c, ib) * mmm->n0prime; + s_mp_mul_d_add_offset(&mmm->N, m_i, c, ib); + } + } + s_mp_clamp(c); + s_mp_div_2d(c, mmm->b); + if (s_mp_cmp(c, &mmm->N) >= 0) { + MP_CHECKOK( s_mp_sub(c, &mmm->N) ); + } + res = MP_OKAY; + +CLEANUP: + return res; +} +#endif + +STATIC +mp_err s_mp_to_mont(const mp_int *x, mp_mont_modulus *mmm, mp_int *xMont) +{ + mp_err res; /* xMont = x * R mod N where N is modulus */ MP_CHECKOK( mpl_lsh(x, xMont, mmm->b) ); /* xMont = x << b */ MP_CHECKOK( mp_div(xMont, &mmm->N, 0, xMont) ); /* mod N */ -loser: - return rv; +CLEANUP: + return res; } @@ -124,28 +188,34 @@ mp_err mp_exptmod(const mp_int *inBase, const mp_int 
*exponent, const mp_int *modulus, mp_int *result) { const mp_int *base; + mp_int *pa1, *pa2, *ptmp; mp_size bits_in_exponent; mp_size i; - mp_err rv; - mp_int square, accum, goodBase, tmp; + mp_size window_bits, odd_ints; + mp_err res; + mp_int square, accum1, accum2, goodBase; mp_mont_modulus mmm; /* function for computing n0prime only works if n0 is odd */ if (!mp_isodd(modulus)) return s_mp_exptmod(inBase, exponent, modulus, result); + MP_DIGITS(&square) = 0; + MP_DIGITS(&accum1) = 0; + MP_DIGITS(&accum2) = 0; + MP_DIGITS(&goodBase) = 0; + if (mp_cmp(inBase, modulus) < 0) { base = inBase; - MP_DIGITS(&goodBase) = 0; } else { - mp_init(&goodBase); + MP_CHECKOK( mp_init(&goodBase) ); base = &goodBase; MP_CHECKOK( mp_mod(inBase, modulus, &goodBase) ); } - mp_init_size(&square, 2 * MP_USED(modulus) + 2); - mp_init_size(&accum, 3 * MP_USED(modulus) + 2); - mp_init_size(&tmp, 3 * MP_USED(modulus) + 2); + MP_CHECKOK( mp_init_size(&square, 2 * MP_USED(modulus) + 2) ); + MP_CHECKOK( mp_init_size(&accum1, 3 * MP_USED(modulus) + 2) ); + MP_CHECKOK( mp_init_size(&accum2, 3 * MP_USED(modulus) + 2) ); mmm.N = *modulus; /* a copy of the mp_int struct */ i = mpl_significant_bits(modulus); @@ -157,18 +227,25 @@ mp_err mp_exptmod(const mp_int *inBase, const mp_int *exponent, */ mmm.n0prime = 0 - s_mp_invmod_32b( MP_DIGIT(modulus, 0) ); - MP_CHECKOK( mp_to_mont(base, &mmm, &square) ); + MP_CHECKOK( s_mp_to_mont(base, &mmm, &square) ); bits_in_exponent = mpl_significant_bits(exponent); - i = bits_in_exponent % WINDOW_BITS; + if (bits_in_exponent > 480) + window_bits = 6; + else if (bits_in_exponent > 160) + window_bits = 5; + else + window_bits = 4; + odd_ints = 1 << (window_bits - 1); + i = bits_in_exponent % window_bits; if (i != 0) { - bits_in_exponent += WINDOW_BITS - i; + bits_in_exponent += window_bits - i; } { /* oddPowers[i] = base ** (2*i + 1); */ - /* power2 = base ** 2; */ int expOff; - mp_int power2, oddPowers[ODD_INTS]; + /* power2 = base ** 2; oddPowers[i] = base ** 
(2*i + 1); */ + mp_int power2, oddPowers[MAX_ODD_INTS]; MP_CHECKOK( mp_init_copy(oddPowers, &square) ); @@ -176,97 +253,122 @@ mp_err mp_exptmod(const mp_int *inBase, const mp_int *exponent, MP_CHECKOK( mp_sqr(&square, &power2) ); /* square = square ** 2 */ MP_CHECKOK( s_mp_redc(&power2, &mmm) ); - for (i = 1; i < ODD_INTS; ++i) { + for (i = 1; i < odd_ints; ++i) { mp_init_size(oddPowers + i, MP_USED(modulus) + 2 * MP_USED(&power2) + 2); MP_CHECKOK( mp_mul(oddPowers + (i - 1), &power2, oddPowers + i) ); MP_CHECKOK( s_mp_redc(oddPowers + i, &mmm) ); } - mp_set(&accum, 1); - MP_CHECKOK( mp_to_mont(&accum, &mmm, &accum) ); + mp_set(&accum1, 1); + MP_CHECKOK( s_mp_to_mont(&accum1, &mmm, &accum1) ); + pa1 = &accum1; + pa2 = &accum2; -#define SQUARE \ - MP_CHECKOK( mp_sqr(&accum, &tmp) );\ - mp_exch(&accum, &tmp); \ - MP_CHECKOK( s_mp_redc(&accum, &mmm) ) +#define SQR(a,b) \ + MP_CHECKOK( mp_sqr(a, b) );\ + MP_CHECKOK( s_mp_redc(b, &mmm) ) -#define MUL(x) \ - MP_CHECKOK( mp_mul(&accum, oddPowers + (x), &tmp) ); \ - mp_exch(&accum, &tmp); \ - MP_CHECKOK( s_mp_redc(&accum, &mmm)) - - for (expOff = bits_in_exponent - WINDOW_BITS; expOff >= 0; expOff -= WINDOW_BITS) { - mp_size smallExp; - MP_CHECKERR( mpl_get_bits(exponent, expOff, WINDOW_BITS) ); - smallExp = (mp_size)rv; - -#if WINDOW_BITS == 4 - if (!smallExp) { - SQUARE; SQUARE; SQUARE; SQUARE; - } else if (smallExp & 1) { - SQUARE; SQUARE; SQUARE; SQUARE; MUL(smallExp/2); - } else if (smallExp & 2) { - SQUARE; SQUARE; SQUARE; MUL(smallExp/4); SQUARE; - } else if (smallExp & 4) { - SQUARE; SQUARE; MUL(smallExp/8); SQUARE; SQUARE; - } else if (smallExp & 8) { - SQUARE; MUL(smallExp/16); SQUARE; SQUARE; SQUARE; - } else { - abort(); - } -#elif WINDOW_BITS == 5 - if (!smallExp) { - SQUARE; SQUARE; SQUARE; SQUARE; SQUARE; - } else if (smallExp & 1) { - SQUARE; SQUARE; SQUARE; SQUARE; SQUARE; MUL(smallExp/2); - } else if (smallExp & 2) { - SQUARE; SQUARE; SQUARE; SQUARE; MUL(smallExp/4); SQUARE; - } else if (smallExp & 4) { 
- SQUARE; SQUARE; SQUARE; MUL(smallExp/8); SQUARE; SQUARE; - } else if (smallExp & 8) { - SQUARE; SQUARE; MUL(smallExp/16); SQUARE; SQUARE; SQUARE; - } else if (smallExp & 0x10) { - SQUARE; MUL(smallExp/32); SQUARE; SQUARE; SQUARE; SQUARE; - } else { - abort(); - } -#elif WINDOW_BITS == 6 - if (!smallExp) { - SQUARE; SQUARE; SQUARE; SQUARE; SQUARE; SQUARE; - } else if (smallExp & 1) { - SQUARE; SQUARE; SQUARE; SQUARE; SQUARE; SQUARE; MUL(smallExp/2); - } else if (smallExp & 2) { - SQUARE; SQUARE; SQUARE; SQUARE; SQUARE; MUL(smallExp/4); SQUARE; - } else if (smallExp & 4) { - SQUARE; SQUARE; SQUARE; SQUARE; MUL(smallExp/8); SQUARE; SQUARE; - } else if (smallExp & 8) { - SQUARE; SQUARE; SQUARE; MUL(smallExp/16); SQUARE; SQUARE; SQUARE; - } else if (smallExp & 0x10) { - SQUARE; SQUARE; MUL(smallExp/32); SQUARE; SQUARE; SQUARE; SQUARE; - } else if (smallExp & 0x20) { - SQUARE; MUL(smallExp/64); SQUARE; SQUARE; SQUARE; SQUARE; SQUARE; - } else { - abort(); - } +#if defined(MP_MONT_USE_MP_MUL) +#define MUL(x,a,b) \ + MP_CHECKOK( mp_mul(a, oddPowers + (x), b) ); \ + MP_CHECKOK( s_mp_redc(b, &mmm) ) #else -#error "Unknown value for WINDOW_BITS" +#define MUL(x,a,b) \ + MP_CHECKOK( s_mp_mul_mont(a, oddPowers + (x), b, &mmm) ) #endif + +#define SWAPPA ptmp = pa1; pa1 = pa2; pa2 = ptmp + + for (expOff = bits_in_exponent - window_bits; expOff >= 0; expOff -= window_bits) { + mp_size smallExp; + MP_CHECKOK( mpl_get_bits(exponent, expOff, window_bits) ); + smallExp = (mp_size)res; + + if (window_bits == 4) { + if (!smallExp) { + SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1); + } else if (smallExp & 1) { + SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1); + MUL(smallExp/2, pa1,pa2); SWAPPA; + } else if (smallExp & 2) { + SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); + MUL(smallExp/4,pa2,pa1); SQR(pa1,pa2); SWAPPA; + } else if (smallExp & 4) { + SQR(pa1,pa2); SQR(pa2,pa1); MUL(smallExp/8,pa1,pa2); + SQR(pa2,pa1); SQR(pa1,pa2); SWAPPA; + } else if (smallExp & 8) { + 
SQR(pa1,pa2); MUL(smallExp/16,pa2,pa1); SQR(pa1,pa2); + SQR(pa2,pa1); SQR(pa1,pa2); SWAPPA; + } else { + abort(); + } + } else if (window_bits == 5) { + if (!smallExp) { + SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1); + SQR(pa1,pa2); SWAPPA; + } else if (smallExp & 1) { + SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1); + SQR(pa1,pa2); MUL(smallExp/2,pa2,pa1); + } else if (smallExp & 2) { + SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1); + MUL(smallExp/4,pa1,pa2); SQR(pa2,pa1); + } else if (smallExp & 4) { + SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); + MUL(smallExp/8,pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1); + } else if (smallExp & 8) { + SQR(pa1,pa2); SQR(pa2,pa1); MUL(smallExp/16,pa1,pa2); + SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1); + } else if (smallExp & 0x10) { + SQR(pa1,pa2); MUL(smallExp/32,pa2,pa1); SQR(pa1,pa2); + SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1); + } else { + abort(); + } + } else if (window_bits == 6) { + if (!smallExp) { + SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1); + SQR(pa1,pa2); SQR(pa2,pa1); + } else if (smallExp & 1) { + SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1); + SQR(pa1,pa2); SQR(pa2,pa1); MUL(smallExp/2,pa1,pa2); SWAPPA; + } else if (smallExp & 2) { + SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1); + SQR(pa1,pa2); MUL(smallExp/4,pa2,pa1); SQR(pa1,pa2); SWAPPA; + } else if (smallExp & 4) { + SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1); + MUL(smallExp/8,pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SWAPPA; + } else if (smallExp & 8) { + SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); + MUL(smallExp/16,pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1); + SQR(pa1,pa2); SWAPPA; + } else if (smallExp & 0x10) { + SQR(pa1,pa2); SQR(pa2,pa1); MUL(smallExp/32,pa1,pa2); + SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SWAPPA; + } else if (smallExp & 0x20) { + SQR(pa1,pa2); MUL(smallExp/64,pa2,pa1); SQR(pa1,pa2); + SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SWAPPA; + } else { + abort(); + } 
+ } else { + abort(); + } } mp_clear(&power2); - for (i = 0; i < ODD_INTS; ++i) { + for (i = 0; i < odd_ints; ++i) { mp_clear(oddPowers + i); } } - rv = s_mp_redc(&accum, &mmm); - mp_exch(&accum, result); -loser: + res = s_mp_redc(pa1, &mmm); + mp_exch(pa1, result); +CLEANUP: mp_clear(&square); - mp_clear(&accum); + mp_clear(&accum1); + mp_clear(&accum2); mp_clear(&goodBase); - mp_clear(&tmp); /* Don't mp_clear mmm.N because it is merely a copy of modulus. ** Just zap it. */ memset(&mmm, 0, sizeof mmm); - return rv; + return res; }