diff --git a/security/nss/lib/freebl/mpi/Makefile b/security/nss/lib/freebl/mpi/Makefile index 98f1c007e93..b747a8e122a 100644 --- a/security/nss/lib/freebl/mpi/Makefile +++ b/security/nss/lib/freebl/mpi/Makefile @@ -41,7 +41,7 @@ # ***** END LICENSE BLOCK ***** # -# $Id: Makefile,v 1.21 2005-02-02 22:28:22 gerv%gerv.net Exp $ +# $Id: Makefile,v 1.22 2005-11-22 07:16:43 relyea%netscape.com Exp $ # ## Define CC to be the C compiler you wish to use. The GNU cc @@ -89,7 +89,8 @@ VERS=1.7p6 ## ## This is the list of source files that need to be packed into ## the distribution file -SRCS= mpi.c mpprime.c mplogic.c mp_gf2m.c mpmontg.c mpi-test.c primes.c tests/ \ +SRCS= mpi.c mpprime.c mplogic.c mp_gf2m.c mpmontg.c mpi-test.c primes.c \ + mpcpucache.c tests/ \ utils/gcd.c utils/invmod.c utils/lap.c \ utils/ptab.pl utils/sieve.c utils/isprime.c\ utils/dec2hex.c utils/hex2dec.c utils/bbs_rand.c \ diff --git a/security/nss/lib/freebl/mpi/mpi-priv.h b/security/nss/lib/freebl/mpi/mpi-priv.h index b9fe8c18b64..b41d05b1217 100644 --- a/security/nss/lib/freebl/mpi/mpi-priv.h +++ b/security/nss/lib/freebl/mpi/mpi-priv.h @@ -42,7 +42,7 @@ * the terms of any one of the MPL, the GPL or the LGPL. * * ***** END LICENSE BLOCK ***** */ -/* $Id: mpi-priv.h,v 1.19 2005-08-16 19:25:48 saul.edwards%sun.com Exp $ */ +/* $Id: mpi-priv.h,v 1.20 2005-11-22 07:16:43 relyea%netscape.com Exp $ */ #ifndef _MPI_PRIV_H_ #define _MPI_PRIV_H_ 1 @@ -300,6 +300,19 @@ mp_err s_mp_mul_mont(const mp_int *a, const mp_int *b, mp_int *c, mp_mont_modulus *mmm); mp_err s_mp_redc(mp_int *T, mp_mont_modulus *mmm); +/* + * s_mpi_getProcessorLineSize() returns the size in bytes of the cache line + * if a cache exists, or zero if there is no cache. If more than one + * cache line exists, it should return the smallest line size (which is + * usually the L1 cache). + * + * mp_modexp uses this information to make sure that private key information + * isn't being leaked through the cache. 
+ * + * see mpcpucache.c for the implementation. + */ +unsigned long s_mpi_getProcessorLineSize(); + /* }}} */ #endif diff --git a/security/nss/lib/freebl/mpi/mpmontg.c b/security/nss/lib/freebl/mpi/mpmontg.c index b1ef102c1a0..d510b29d99d 100644 --- a/security/nss/lib/freebl/mpi/mpmontg.c +++ b/security/nss/lib/freebl/mpi/mpmontg.c @@ -36,7 +36,7 @@ * the terms of any one of the MPL, the GPL or the LGPL. * * ***** END LICENSE BLOCK ***** */ -/* $Id: mpmontg.c,v 1.15 2004-04-27 23:04:36 gerv%gerv.net Exp $ */ +/* $Id: mpmontg.c,v 1.16 2005-11-22 07:16:43 relyea%netscape.com Exp $ */ /* This file implements moduluar exponentiation using Montgomery's * method for modular reduction. This file implements the method @@ -47,7 +47,7 @@ * published by Springer Verlag. */ -/* #define MP_USING_MONT_MULF 1 */ +#define MP_USING_CACHE_SAFE_MOD_EXP 1 #include #include "mpi-priv.h" #include "mplogic.h" @@ -55,11 +55,19 @@ #ifdef MP_USING_MONT_MULF #include "montmulf.h" #endif
+#include <stddef.h> /* ptrdiff_t */
+
+/* If MP_CHAR_STORE_SLOW is defined, the word-at-a-time weave code further
+ * down is endian specific, so the build must state the target byte order.
+ * The build files in this change pass -DMP_IS_LITTLE_ENDIAN, while this
+ * check used to test only the MPI_IS_*_ENDIAN spelling and therefore hit
+ * the #error on a correctly configured build.  Accept either spelling. */
+#ifdef MP_CHAR_STORE_SLOW
+#if !defined(MP_IS_BIG_ENDIAN) && !defined(MP_IS_LITTLE_ENDIAN) && \
+    !defined(MPI_IS_BIG_ENDIAN) && !defined(MPI_IS_LITTLE_ENDIAN)
+#error "You must define MP_IS_BIG_ENDIAN or MP_IS_LITTLE_ENDIAN if you define MP_CHAR_STORE_SLOW."
+#endif
+#endif
 #define STATIC -/* #define DEBUG 1 */ -#define MAX_WINDOW_BITS 6 #define MAX_ODD_INTS 32 /* 2 ** (WINDOW_BITS - 1) */ #if defined(_WIN32_WCE) @@ -174,6 +182,13 @@ CLEANUP: #ifdef MP_USING_MONT_MULF +/* the floating point multiply is already cache safe, + * don't turn on cache safe unless we specifically + * force it */ +#ifndef MP_FORCE_CACHE_SAFE +#undef MP_USING_CACHE_SAFE_MOD_EXP +#endif + unsigned int mp_using_mont_mulf = 1; /* computes montgomery square of the integer in mResult */ @@ -504,6 +519,564 @@ CLEANUP: #undef SQR #undef MUL
+#ifdef MP_USING_CACHE_SAFE_MOD_EXP
+/* Run-time switch: non-zero selects the cache-safe exponentiation path. */
+unsigned int mp_using_cache_safe_exp = 1;
+#endif
+
+/*
+ * mp_set_safe_modexp turns the cache-safe modular exponentiation path on
+ * (value != 0) or off (value == 0) at run time.
+ *
+ * When the cache-safe code is compiled in, value is stored in
+ * mp_using_cache_safe_exp and MP_OKAY is returned.  When it is compiled
+ * out, only a request to disable it (value == 0) succeeds with MP_OKAY;
+ * any other value returns MP_BADARG.
+ */
+mp_err mp_set_safe_modexp(int value)
+{
+#ifdef MP_USING_CACHE_SAFE_MOD_EXP
+  mp_using_cache_safe_exp = value;
+  return MP_OKAY;
+#else
+  if (value == 0) {
+    return MP_OKAY;
+  }
+  return MP_BADARG;
+#endif
+}
+
+#ifdef MP_USING_CACHE_SAFE_MOD_EXP
+#define WEAVE_WORD_SIZE 4
+
+#ifndef MP_CHAR_STORE_SLOW
+/*
+ * mpi_to_weave takes MPI data and stores it into a byte array interleaved.
+ *
+ * The purpose of this interleaving is to hide our access to the array of
+ * modulus powers from an attacker snooping on cache hits and misses. Because
+ * the array is interleaved, each reference will cause exactly the same cache
+ * lines to reload.
+ *
+ * There are 2 different implementations in this file, one which works with just
+ * byte loads and stores, the second which works with mp_weave_word loads and
+ * stores. These 2 implementations have DIFFERENT results in exactly which byte
+ * of an mp_digit winds up in which location in the byte array. That is why
+ * there are 2 sets of explanations for how the array is set up.
+ *
+ *
+ * a is an array of WEAVE_WORD_SIZE mp_ints (that is 4).
+ * It is a partial window into a logical array mp_int p[count] containing
+ * the base to the 0 through count-1 powers.
Ideally this code would be
+ * simpler if we stored one element of that array at a time, but on some
+ * platforms the cost of storing a byte incurs a full read-modify-write cycle
+ * and increases the memory bandwidth cost by a factor of 4 or 8. By collecting
+ * four mp_ints together, we can arrange to store all 4 values in a single
+ * word write.
+ *
+ * b is the targeted weaved location. b[0] points to the byte where the
+ * first byte of the a array needs to be stored. Each b is an offset into the
+ * weave array.
+ *
+ * count is 2^(window size).
+ *
+ * b_size is the size in mp_digits of each mp_int in the array. mp_ints
+ * with fewer than b_size elements are logically padded with zeros before
+ * storing.
+ *
+ *
+ * Data is stored as follows :
+ * The mp_int array is treated as a byte array.
+ *
+ *
+ * we want to eventually store the logical array mp_int p[count] into the
+ * weave array as follows:
+ *
+ * p[count].digit is treated as a byte array (rather than an mp_digit array),
+ * N is count, and n is b_size * sizeof(mp_digit):
+ *
+ * p[0].digit[0]   p[1].digit[0]   ...... p[N-2].digit[0]   p[N-1].digit[0]
+ * p[0].digit[1]   p[1].digit[1]   ...... p[N-2].digit[1]   p[N-1].digit[1]
+ *        .                                                       .
+ *        .                                                       .
+ * p[0].digit[n-2] p[1].digit[n-2] ...... p[N-2].digit[n-2] p[N-1].digit[n-2]
+ * p[0].digit[n-1] p[1].digit[n-1] ...... p[N-2].digit[n-1] p[N-1].digit[n-1]
+ *
+ * This function stores a window of p in each call.
+ *
+ * Every one of the WEAVE_WORD_SIZE inputs contributes exactly
+ * b_size * sizeof(mp_digit) bytes to the weave array (its used digits
+ * followed by zero padding), each byte written count bytes after the
+ * previous one.  Returns MP_OKAY; the ARGCHK lines reject a negative input
+ * or one with more than b_size used digits with MP_BADARG (ARGCHK itself is
+ * defined outside this file section - presumably a no-op when argument
+ * checking is compiled out).
+ */
+mp_err mpi_to_weave(const mp_int *a, unsigned char *b,
+                    mp_size b_size, mp_size count)
+{
+  mp_size i, j;
+  unsigned char *bsave = b;
+
+  for (i=0; i < WEAVE_WORD_SIZE; i++) {
+    unsigned char *pb = (unsigned char *)MP_DIGITS(&a[i]);
+    mp_size useda = MP_USED(&a[i]);
+    mp_size zero;
+    unsigned char *end = pb+ (useda*sizeof(mp_digit));
+    /* column i of the weave array belongs to a[i] */
+    b = bsave+i;
+
+    ARGCHK(MP_SIGN(&a[i]) == MP_ZPOS, MP_BADARG);
+    ARGCHK(useda <= b_size, MP_BADARG);
+
+    /* Padding is counted in BYTES, not digits: the store loop below writes
+     * one byte per iteration, and weave_to_mpi reads back
+     * b_size*sizeof(mp_digit) bytes.  Padding only (b_size - useda) bytes
+     * would leave the tail of a short value unwritten. */
+    zero = (b_size - useda) * sizeof(mp_digit);
+
+    /* copy the used digits, one byte per row of the weave array */
+    for (; pb < end; pb++) {
+      *b = *pb;
+      b += count;
+    }
+    /* zero-fill the rows that correspond to the unused high digits */
+    for (j=0; j < zero; j++) {
+      *b = 0;
+      b += count;
+    }
+  }
+
+  return MP_OKAY;
+}
+
+/* reverse the operation above for one entry.
+ * b points to the offset into the weave array of the power we are
+ * calculating.
+ *
+ * Reads b_size*sizeof(mp_digit) bytes, count bytes apart, into the digit
+ * buffer of a, marks a as non-negative with b_size used digits and then
+ * clamps it.  The caller must have allocated at least b_size digits in a.
+ * Always returns MP_OKAY. */
+mp_err weave_to_mpi(mp_int *a, const unsigned char *b,
+                    mp_size b_size, mp_size count)
+{
+  unsigned char *pb = (unsigned char *)MP_DIGITS(a);
+  unsigned char *end = pb+ (b_size*sizeof(mp_digit));
+
+  MP_SIGN(a) = MP_ZPOS;
+  MP_USED(a) = b_size;
+
+  for (; pb < end; b+=count, pb++) {
+    *pb = *b;
+  }
+  s_mp_clamp(a);
+  return MP_OKAY;
+}
+#else
+/* Need a primitive that we know is 32 bits long... */
+/* this is true on all modern processors we know of today */
+typedef unsigned int mp_weave_word;
+
+/*
+ * on some platforms character stores into memory is very expensive since they
+ * generate a read/modify/write operation on the bus. On those platforms
+ * we need to do integer writes to the bus. Because of some unrolled code,
+ * in this current code the size of mp_weave_word must be four. The code that
+ * makes this assumption explicit is called out. (on some platforms a write
+ * of 4 bytes still requires a single read-modify-write operation.)
+ *
+ * This function takes the identical parameters as the function above,
+ * however it lays out the final array differently.
Where the previous function
+ * treats the mpi_int as a byte array, this function treats it as an array of
+ * mp_digits where each digit is stored in big endian order.
+ *
+ * since we need to interleave on a byte by byte basis, we need to collect
+ * several mpi structures together into a single uint32 before we write. We
+ * also need to make sure the uint32 is arranged so that the first value of
+ * the first array winds up in b[0]. This means construction of that uint32
+ * is endian specific (even though the layout of the mp_digits in the array
+ * is always big endian).
+ *
+ * The final data is stored as follows :
+ *
+ * Our same logical array p array, m is sizeof(mp_digit),
+ * N is still count and n is now b_size. If we define p[i].digit[j]0 as the
+ * most significant byte of the word p[i].digit[j], p[i].digit[j]1 as
+ * the next most significant byte of p[i].digit[j], ...  and p[i].digit[j]m-1
+ * is the least significant byte.
+ * Our array would look like:
+ * p[0].digit[0]0     p[1].digit[0]0     ... p[N-2].digit[0]0     p[N-1].digit[0]0
+ * p[0].digit[0]1     p[1].digit[0]1     ... p[N-2].digit[0]1     p[N-1].digit[0]1
+ *                .                                         .
+ * p[0].digit[0]m-1   p[1].digit[0]m-1   ... p[N-2].digit[0]m-1   p[N-1].digit[0]m-1
+ * p[0].digit[1]0     p[1].digit[1]0     ... p[N-2].digit[1]0     p[N-1].digit[1]0
+ *                .                                         .
+ *                .                                         .
+ * p[0].digit[n-1]m-2 p[1].digit[n-1]m-2 ... p[N-2].digit[n-1]m-2 p[N-1].digit[n-1]m-2
+ * p[0].digit[n-1]m-1 p[1].digit[n-1]m-1 ... p[N-2].digit[n-1]m-1 p[N-1].digit[n-1]m-1
+ *
+ * b is written through an mp_weave_word pointer, so it must be suitably
+ * aligned for that type, and count is converted from bytes to words before
+ * use.  Returns MP_OKAY; the ARGCHK lines reject negative or over-long
+ * inputs with MP_BADARG.
+ */
+mp_err mpi_to_weave(const mp_int *a, unsigned char *b,
+                    mp_size b_size, mp_size count)
+{
+  mp_size i;
+  mp_digit *digitsa0;
+  mp_digit *digitsa1;
+  mp_digit *digitsa2;
+  mp_digit *digitsa3;
+  mp_size   useda0;
+  mp_size   useda1;
+  mp_size   useda2;
+  mp_size   useda3;
+  mp_weave_word *weaved = (mp_weave_word *)b;
+
+  /* the row stride is applied to a word pointer from here on */
+  count = count/sizeof(mp_weave_word);
+
+  /* this code pretty much depends on this !
+   * Debug-only sanity checks; guarded by MP_ARGCHK == 2 like the other
+   * assert() in this file, so they are only compiled in assert mode. */
+#if MP_ARGCHK == 2
+  assert(WEAVE_WORD_SIZE == 4);
+  assert(sizeof(mp_weave_word) == 4);
+#endif
+
+  digitsa0 = MP_DIGITS(&a[0]);
+  digitsa1 = MP_DIGITS(&a[1]);
+  digitsa2 = MP_DIGITS(&a[2]);
+  digitsa3 = MP_DIGITS(&a[3]);
+  useda0 = MP_USED(&a[0]);
+  useda1 = MP_USED(&a[1]);
+  useda2 = MP_USED(&a[2]);
+  useda3 = MP_USED(&a[3]);
+
+  ARGCHK(MP_SIGN(&a[0]) == MP_ZPOS, MP_BADARG);
+  ARGCHK(MP_SIGN(&a[1]) == MP_ZPOS, MP_BADARG);
+  ARGCHK(MP_SIGN(&a[2]) == MP_ZPOS, MP_BADARG);
+  ARGCHK(MP_SIGN(&a[3]) == MP_ZPOS, MP_BADARG);
+  ARGCHK(useda0 <= b_size, MP_BADARG);
+  ARGCHK(useda1 <= b_size, MP_BADARG);
+  ARGCHK(useda2 <= b_size, MP_BADARG);
+  ARGCHK(useda3 <= b_size, MP_BADARG);
+
+/* digits past the used length read as zero (logical zero padding) */
+#define SAFE_FETCH(digit, used, word) ((word) < (used) ? (digit[word]) : 0)
+
+  for (i=0; i < b_size; i++) {
+    mp_digit d0 = SAFE_FETCH(digitsa0,useda0,i);
+    mp_digit d1 = SAFE_FETCH(digitsa1,useda1,i);
+    mp_digit d2 = SAFE_FETCH(digitsa2,useda2,i);
+    mp_digit d3 = SAFE_FETCH(digitsa3,useda3,i);
+    register mp_weave_word acc;
+
+/*
+ * ONE_STEP takes the MSB of each of our current digits and places that
+ * byte in the appropriate position for writing to the weaved array.
+ *  On little endian:
+ *   b3 b2 b1 b0
+ *  On big endian:
+ *   b0 b1 b2 b3
+ *  When the data is written it would always wind up:
+ *   b[0] = b0
+ *   b[1] = b1
+ *   b[2] = b2
+ *   b[3] = b3
+ *
+ * Once we've written the MSB, we shift the whole digit left one
+ * byte, putting the Next Most Significant Byte in the MSB position,
+ * so when we repeat the next one step that byte will be written.
+ * NOTE: This code assumes sizeof(mp_weave_word) and WEAVE_WORD_SIZE
+ * are both 4.
+ *
+ * The byte order is selected by whichever spelling of the little-endian
+ * macro the build provides: MP_IS_LITTLE_ENDIAN is what the build files in
+ * this change define, MPI_IS_LITTLE_ENDIAN is what the configuration check
+ * near the top of this file names, and IS_LITTLE_ENDIAN is what this code
+ * originally tested.  Testing only the last one silently selected the
+ * big-endian layout on little-endian targets.
+ */
+#if defined(MP_IS_LITTLE_ENDIAN) || defined(MPI_IS_LITTLE_ENDIAN) || \
+    defined(IS_LITTLE_ENDIAN)
+#define MPI_WEAVE_ONE_STEP \
+    acc  = (d0 >> (MP_DIGIT_BITS-8))  & 0x000000ff; d0 <<= 8; /*b0*/ \
+    acc |= (d1 >> (MP_DIGIT_BITS-16)) & 0x0000ff00; d1 <<= 8; /*b1*/ \
+    acc |= (d2 >> (MP_DIGIT_BITS-24)) & 0x00ff0000; d2 <<= 8; /*b2*/ \
+    acc |= (d3 >> (MP_DIGIT_BITS-32)) & 0xff000000; d3 <<= 8; /*b3*/ \
+    *weaved = acc; weaved += count;
+#else
+#define MPI_WEAVE_ONE_STEP \
+    acc  = (d0 >> (MP_DIGIT_BITS-32)) & 0xff000000; d0 <<= 8; /*b0*/ \
+    acc |= (d1 >> (MP_DIGIT_BITS-24)) & 0x00ff0000; d1 <<= 8; /*b1*/ \
+    acc |= (d2 >> (MP_DIGIT_BITS-16)) & 0x0000ff00; d2 <<= 8; /*b2*/ \
+    acc |= (d3 >> (MP_DIGIT_BITS-8))  & 0x000000ff; d3 <<= 8; /*b3*/ \
+    *weaved = acc; weaved += count;
+#endif
+    /* Unrolled: the cases fall through on purpose, so entering at
+     * case sizeof(mp_digit) executes exactly sizeof(mp_digit) steps. */
+    switch (sizeof(mp_digit)) {
+    case 32:
+      MPI_WEAVE_ONE_STEP
+      MPI_WEAVE_ONE_STEP
+      MPI_WEAVE_ONE_STEP
+      MPI_WEAVE_ONE_STEP
+      MPI_WEAVE_ONE_STEP
+      MPI_WEAVE_ONE_STEP
+      MPI_WEAVE_ONE_STEP
+      MPI_WEAVE_ONE_STEP
+      MPI_WEAVE_ONE_STEP
+      MPI_WEAVE_ONE_STEP
+      MPI_WEAVE_ONE_STEP
+      MPI_WEAVE_ONE_STEP
+      MPI_WEAVE_ONE_STEP
+      MPI_WEAVE_ONE_STEP
+      MPI_WEAVE_ONE_STEP
+      MPI_WEAVE_ONE_STEP
+    case 16:
+      MPI_WEAVE_ONE_STEP
+      MPI_WEAVE_ONE_STEP
+      MPI_WEAVE_ONE_STEP
+      MPI_WEAVE_ONE_STEP
+      MPI_WEAVE_ONE_STEP
+      MPI_WEAVE_ONE_STEP
+      MPI_WEAVE_ONE_STEP
+      MPI_WEAVE_ONE_STEP
+    case 8:
+      MPI_WEAVE_ONE_STEP
+      MPI_WEAVE_ONE_STEP
+      MPI_WEAVE_ONE_STEP
+      MPI_WEAVE_ONE_STEP
+    case 4:
+      MPI_WEAVE_ONE_STEP
+      MPI_WEAVE_ONE_STEP
+    case 2:
+      MPI_WEAVE_ONE_STEP
+    case 1:
+      MPI_WEAVE_ONE_STEP
+      break;
+    }
+  }
+
+  return MP_OKAY;
+}
+
+/* reverse the operation above for one entry.
+ * b points to the offset into the weave array of the power we are
+ * calculating.
+ *
+ * Each digit is rebuilt most-significant byte first from sizeof(mp_digit)
+ * bytes that lie count bytes apart.  a is marked non-negative with b_size
+ * used digits and then clamped; the caller must have allocated at least
+ * b_size digits in a.  Always returns MP_OKAY. */
+mp_err weave_to_mpi(mp_int *a, const unsigned char *b,
+                    mp_size b_size, mp_size count)
+{
+  mp_digit *pb = MP_DIGITS(a);
+  mp_digit *end = &pb[b_size];
+
+  MP_SIGN(a) = MP_ZPOS;
+  MP_USED(a) = b_size;
+
+  for (; pb < end; pb++) {
+    register mp_digit digit;
+
+    /* first (most significant) byte, pre-shifted for the next one */
+    digit = *b << 8; b += count;
+#define MPI_UNWEAVE_ONE_STEP  digit |= *b; b += count; digit = digit << 8;
+    /* Unrolled with deliberate fall-through: together with the first byte
+     * above and the last byte below, sizeof(mp_digit) bytes are consumed. */
+    switch (sizeof(mp_digit)) {
+    case 32:
+      MPI_UNWEAVE_ONE_STEP
+      MPI_UNWEAVE_ONE_STEP
+      MPI_UNWEAVE_ONE_STEP
+      MPI_UNWEAVE_ONE_STEP
+      MPI_UNWEAVE_ONE_STEP
+      MPI_UNWEAVE_ONE_STEP
+      MPI_UNWEAVE_ONE_STEP
+      MPI_UNWEAVE_ONE_STEP
+      MPI_UNWEAVE_ONE_STEP
+      MPI_UNWEAVE_ONE_STEP
+      MPI_UNWEAVE_ONE_STEP
+      MPI_UNWEAVE_ONE_STEP
+      MPI_UNWEAVE_ONE_STEP
+      MPI_UNWEAVE_ONE_STEP
+      MPI_UNWEAVE_ONE_STEP
+      MPI_UNWEAVE_ONE_STEP
+    case 16:
+      MPI_UNWEAVE_ONE_STEP
+      MPI_UNWEAVE_ONE_STEP
+      MPI_UNWEAVE_ONE_STEP
+      MPI_UNWEAVE_ONE_STEP
+      MPI_UNWEAVE_ONE_STEP
+      MPI_UNWEAVE_ONE_STEP
+      MPI_UNWEAVE_ONE_STEP
+      MPI_UNWEAVE_ONE_STEP
+    case 8:
+      MPI_UNWEAVE_ONE_STEP
+      MPI_UNWEAVE_ONE_STEP
+      MPI_UNWEAVE_ONE_STEP
+      MPI_UNWEAVE_ONE_STEP
+    case 4:
+      MPI_UNWEAVE_ONE_STEP
+      MPI_UNWEAVE_ONE_STEP
+    case 2:
+      break;
+    }
+    /* last (least significant) byte: no shift afterwards */
+    digit |= *b; b += count;
+
+    *pb = digit;
+  }
+  s_mp_clamp(a);
+  return MP_OKAY;
+}
+#endif
+
+
+/* The helper macros below expand inside mp_exptmod_safe_i and use its
+ * locals (mmm, tmp, powers, nLen, num_powers, pa1, pa2, ptmp) as well as
+ * the res variable and CLEANUP label that MP_CHECKOK relies on. */
+
+/* b = a*a, Montgomery reduced */
+#define SQR(a,b) \
+  MP_CHECKOK( mp_sqr(a, b) );\
+  MP_CHECKOK( s_mp_redc(b, mmm) )
+
+/* b = a*x, Montgomery reduced, x given directly as an mp_int */
+#if defined(MP_MONT_USE_MP_MUL)
+#define MUL_NOWEAVE(x,a,b) \
+  MP_CHECKOK( mp_mul(a, x, b) ); \
+  MP_CHECKOK( s_mp_redc(b, mmm) )
+#else
+#define MUL_NOWEAVE(x,a,b) \
+  MP_CHECKOK( s_mp_mul_mont(a, x, b, mmm) )
+#endif
+
+/* b = a * powers[x]: the power is first unwoven into tmp */
+#define MUL(x,a,b) \
+  MP_CHECKOK( weave_to_mpi(&tmp, powers + (x), nLen, num_powers) ); \
+  MUL_NOWEAVE(&tmp,a,b)
+
+#define SWAPPA ptmp = pa1; pa1 = pa2; pa2 = ptmp
+/* round pointer x up to a multiple of y; y must be a power of two */
+#define MP_ALIGN(x,y) ((((ptrdiff_t)(x))+((y)-1))&(~((y)-1)))
+
+/* Do modular exponentiation using integer multiply code.
*/
+/*
+ * mp_exptmod_safe_i computes the exponentiation with a fixed window, keeping
+ * ALL powers 0 .. num_powers-1 of the base in one interleaved ("weaved")
+ * byte array so that fetching any one power touches the same rows of the
+ * table.
+ *
+ *  montBase         the base, already converted to Montgomery form
+ *  exponent         the exponent; read window_bits at a time, top down
+ *  modulus          not referenced in this function; reduction uses mmm
+ *  result           receives the answer (exchanged with the accumulator)
+ *  mmm              Montgomery modulus context used for every reduction
+ *  nLen             number of digits stored per power in the weave array
+ *  bits_in_exponent exponent length in bits - presumably already rounded up
+ *                   to a multiple of window_bits by the caller (verify)
+ *  window_bits      window size; only 1, 4, 5 and 6 are handled, any other
+ *                   value reaches ABORT
+ *  num_powers       number of table entries; the caller in this file passes
+ *                   1 << window_bits
+ *
+ * Returns MP_MEM if the table cannot be allocated, otherwise the first
+ * error reported through MP_CHECKOK, or the result of the final s_mp_redc.
+ */
+mp_err mp_exptmod_safe_i(const mp_int *   montBase,
+                         const mp_int *   exponent,
+                         const mp_int *   modulus,
+                         mp_int *         result,
+                         mp_mont_modulus *mmm,
+                         int              nLen,
+                         mp_size          bits_in_exponent,
+                         mp_size          window_bits,
+                         mp_size          num_powers)
+{
+  mp_int *pa1, *pa2, *ptmp;
+  mp_size i;
+  mp_size first_window;
+  mp_err  res;
+  int     expOff;
+  mp_int  accum1, accum2, accum[WEAVE_WORD_SIZE];
+  mp_int  tmp;
+  unsigned char *powersArray;
+  unsigned char *powers;
+
+  /* Mark every mp_int as unallocated BEFORE anything can jump to CLEANUP
+   * (including the allocation failure just below), so that CLEANUP never
+   * hands an uninitialized structure to mp_clear().  This relies, as the
+   * pre-zeroing always did, on mp_clear() accepting a NULL digit pointer. */
+  MP_DIGITS(&accum1) = 0;
+  MP_DIGITS(&accum2) = 0;
+  MP_DIGITS(&accum[0]) = 0;
+  MP_DIGITS(&accum[1]) = 0;
+  MP_DIGITS(&accum[2]) = 0;
+  MP_DIGITS(&accum[3]) = 0;
+  MP_DIGITS(&tmp) = 0;
+
+  /* one byte-column per power, plus num_powers bytes of alignment slack */
+  powersArray = (unsigned char *)malloc(num_powers*(nLen*sizeof(mp_digit)+1));
+  if (powersArray == NULL) {
+    res = MP_MEM;
+    goto CLEANUP;
+  }
+
+  /* powers[i] = base ** (i); */
+  powers = (unsigned char *)MP_ALIGN(powersArray,num_powers);
+
+  /* grab the first window value. This allows us to preload accumulator1
+   * and save a conversion, some squares and a multiply */
+  MP_CHECKOK( mpl_get_bits(exponent,
+                           bits_in_exponent-window_bits, window_bits) );
+  first_window = (mp_size)res;
+
+  MP_CHECKOK( mp_init_size(&accum1, 3 * nLen + 2) );
+  MP_CHECKOK( mp_init_size(&accum2, 3 * nLen + 2) );
+  MP_CHECKOK( mp_init_size(&tmp, 3 * nLen + 2) );
+
+  /* build the first WEAVE_WORD powers inline */
+  /* if WEAVE_WORD_SIZE is not 4, this code will have to change */
+  if (num_powers > 2) {
+    MP_CHECKOK( mp_init_size(&accum[0], 3 * nLen + 2) );
+    MP_CHECKOK( mp_init_size(&accum[1], 3 * nLen + 2) );
+    MP_CHECKOK( mp_init_size(&accum[2], 3 * nLen + 2) );
+    MP_CHECKOK( mp_init_size(&accum[3], 3 * nLen + 2) );
+    mp_set(&accum[0], 1);
+    MP_CHECKOK( s_mp_to_mont(&accum[0], mmm, &accum[0]) );
+    MP_CHECKOK( mp_copy(montBase, &accum[1]) );
+    SQR(montBase, &accum[2]);
+    MUL_NOWEAVE(montBase, &accum[2], &accum[3]);
+    MP_CHECKOK( mpi_to_weave(accum, powers, nLen, num_powers) );
+    if (first_window < 4) {
+      MP_CHECKOK( mp_copy(&accum[first_window], &accum1) );
+      /* num_powers is out of range for a window value: marks accum1 as set */
+      first_window = num_powers;
+    }
+  } else {
+    if (first_window == 0) {
+      mp_set(&accum1, 1);
+      MP_CHECKOK( s_mp_to_mont(&accum1, mmm, &accum1) );
+    } else {
+      /* assert first_window == 1? */
+      MP_CHECKOK( mp_copy(montBase, &accum1) );
+    }
+  }
+
+  /*
+   * calculate all the powers in the powers array.
+   * this adds 2**(k-1)-2 square operations over just calculating the
+   * odd powers where k is the window size in the two other mp_modexpt
+   * implementations in this file. We will get some of that
+   * back by not needing the first 'k' squares and one multiply for the
+   * first window */
+  for (i = WEAVE_WORD_SIZE; i < num_powers; i++) {
+    int acc_index = i & (WEAVE_WORD_SIZE-1); /* i % WEAVE_WORD_SIZE */
+    if ( i & 1 ) {
+      /* odd power: base**i = base**(i-1) * base */
+      MUL_NOWEAVE(montBase, &accum[acc_index-1] , &accum[acc_index]);
+      /* we've filled the array do our 'per array' processing */
+      if (acc_index == (WEAVE_WORD_SIZE-1)) {
+        MP_CHECKOK( mpi_to_weave(accum, powers + i - (WEAVE_WORD_SIZE-1),
+                                 nLen, num_powers) );
+
+        if (first_window <= i) {
+          MP_CHECKOK( mp_copy(&accum[first_window & (WEAVE_WORD_SIZE-1)],
+                              &accum1) );
+          first_window = num_powers;
+        }
+      }
+    } else {
+      /* even power: base**i = (base**(i/2)) squared.
+       * up to 8 we can find base**(i/2) in the accum array, but at 8 our
+       * source and target are the same so we need to copy. After that, the
+       * value is overwritten, so we need to fetch it from the stored
+       * weave array */
+      if (i > 2* WEAVE_WORD_SIZE) {
+        MP_CHECKOK(weave_to_mpi(&accum2, powers+i/2, nLen, num_powers));
+        SQR(&accum2, &accum[acc_index]);
+      } else {
+        int half_power_index = (i/2) & (WEAVE_WORD_SIZE-1);
+        if (half_power_index == acc_index) {
+          /* copy is cheaper than weave_to_mpi */
+          MP_CHECKOK(mp_copy(&accum[half_power_index], &accum2));
+          SQR(&accum2,&accum[acc_index]);
+        } else {
+          SQR(&accum[half_power_index],&accum[acc_index]);
+        }
+      }
+    }
+  }
+  /* if the accum1 isn't set, Then there is something wrong with our logic
+   * above and is an internal programming error.
+   */
+#if MP_ARGCHK == 2
+  assert(MP_USED(&accum1) != 0);
+#endif
+
+  /* accum1 already holds base ** first_window; continue from there */
+  pa1 = &accum1;
+  pa2 = &accum2;
+
+  for (expOff = bits_in_exponent - window_bits*2; expOff >= 0; expOff -= window_bits) {
+    mp_size smallExp;
+    MP_CHECKOK( mpl_get_bits(exponent, expOff, window_bits) );
+    smallExp = (mp_size)res;
+
+    /* hand unrolled: window_bits squarings, then one multiply by the
+     * selected power; the running value always ends up in pa1 */
+    switch (window_bits) {
+    case 1:
+      if (!smallExp) {
+        SQR(pa1,pa2); SWAPPA;
+      } else if (smallExp & 1) {
+        SQR(pa1,pa2); MUL_NOWEAVE(montBase,pa2,pa1);
+      } else {
+        ABORT;
+      }
+      break;
+    case 6:
+      SQR(pa1,pa2); SQR(pa2,pa1);
+      /* fall through */
+    case 4:
+      SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1);
+      MUL(smallExp, pa1,pa2); SWAPPA;
+      break;
+    case 5:
+      SQR(pa1,pa2); SQR(pa2,pa1); SQR(pa1,pa2); SQR(pa2,pa1);
+      SQR(pa1,pa2); MUL(smallExp,pa2,pa1);
+      break;
+    default:
+      ABORT; /* could do a loop? */
+    }
+  }
+
+  /* leave Montgomery form and hand the value to the caller */
+  res = s_mp_redc(pa1, mmm);
+  mp_exch(pa1, result);
+
+CLEANUP:
+  mp_clear(&accum1);
+  mp_clear(&accum2);
+  mp_clear(&accum[0]);
+  mp_clear(&accum[1]);
+  mp_clear(&accum[2]);
+  mp_clear(&accum[3]);
+  /* tmp is allocated above and used by MUL; it was previously leaked */
+  mp_clear(&tmp);
+  /* PORT_Memset(powers,0,num_powers*nLen*sizeof(mp_digit)); */
+  free(powersArray);
+  return res;
+}
+#undef SQR
+#undef MUL
+#endif
 mp_err mp_exptmod(const mp_int *inBase, const mp_int *exponent, const mp_int *modulus, mp_int *result) @@ -514,6 +1087,9 @@ mp_err mp_exptmod(const mp_int *inBase, const mp_int *exponent, int nLen; mp_int montBase, goodBase; mp_mont_modulus mmm; +#ifdef MP_USING_CACHE_SAFE_MOD_EXP + static unsigned int max_window_bits; +#endif /* function for computing n0prime only works if n0 is odd */ if (!mp_isodd(modulus)) @@ -546,6 +1122,21 @@ mp_err mp_exptmod(const mp_int *inBase, const mp_int *exponent, MP_CHECKOK( s_mp_to_mont(base, &mmm, &montBase) ); bits_in_exponent = mpl_significant_bits(exponent); +#ifdef MP_USING_CACHE_SAFE_MOD_EXP + if (mp_using_cache_safe_exp) { + if (bits_in_exponent > 780) + window_bits = 6; + else if (bits_in_exponent >
256) + window_bits = 5; + else if (bits_in_exponent > 20) + window_bits = 4; + /* RSA public key exponents are typically under 20 bits (common values + * are: 3, 17, 65537) and a 4-bit window is inefficient + */ + else + window_bits = 1; + } else +#endif if (bits_in_exponent > 480) window_bits = 6; else if (bits_in_exponent > 160) @@ -557,6 +1148,35 @@ mp_err mp_exptmod(const mp_int *inBase, const mp_int *exponent, */ else window_bits = 1; + +#ifdef MP_USING_CACHE_SAFE_MOD_EXP + /* + * clamp the window size based on + * the cache line size. + */ + if (!max_window_bits) { + unsigned long cache_size = s_mpi_getProcessorLineSize(); + /* processor has no cache, use 'fast' code always */ + if (cache_size == 0) { + mp_using_cache_safe_exp = 0; + } + if ((cache_size == 0) || (cache_size >= 64)) { + max_window_bits = 6; + } else if (cache_size >= 32) { + max_window_bits = 5; + } else if (cache_size >= 16) { + max_window_bits = 4; + } else max_window_bits = 1; /* should this be an assert? */ + } + + /* clamp the window size down before we caclulate bits_in_exponent */ + if (mp_using_cache_safe_exp) { + if (window_bits > max_window_bits) { + window_bits = max_window_bits; + } + } +#endif + odd_ints = 1 << (window_bits - 1); i = bits_in_exponent % window_bits; if (i != 0) { @@ -569,6 +1189,12 @@ mp_err mp_exptmod(const mp_int *inBase, const mp_int *exponent, res = mp_exptmod_f(&montBase, exponent, modulus, result, &mmm, nLen, bits_in_exponent, window_bits, odd_ints); } else +#endif +#ifdef MP_USING_CACHE_SAFE_MOD_EXP + if (mp_using_cache_safe_exp) { + res = mp_exptmod_safe_i(&montBase, exponent, modulus, result, &mmm, nLen, + bits_in_exponent, window_bits, 1 << window_bits); + } else #endif res = mp_exptmod_i(&montBase, exponent, modulus, result, &mmm, nLen, bits_in_exponent, window_bits, odd_ints); diff --git a/security/nss/lib/freebl/mpi/target.mk b/security/nss/lib/freebl/mpi/target.mk index 0edaf88139c..c178548195c 100644 --- a/security/nss/lib/freebl/mpi/target.mk +++ 
b/security/nss/lib/freebl/mpi/target.mk @@ -206,7 +206,7 @@ ifeq ($(TARGET),x86LINUX) #Linux AS_OBJS = mpi_x86.o MPICMN += -DMP_ASSEMBLY_MULTIPLY -DMP_ASSEMBLY_SQUARE -DMP_ASSEMBLY_DIV_2DX1D -MPICMN += -DMP_MONT_USE_MP_MUL +MPICMN += -DMP_MONT_USE_MP_MUL -DMP_CHAR_STORE_SLOW -DMP_IS_LITTLE_ENDIAN CFLAGS= -O2 -fPIC -DLINUX1_2 -Di386 -D_XOPEN_SOURCE -DLINUX2_1 -ansi -Wall \ -pipe -DLINUX -Dlinux -D_POSIX_SOURCE -D_BSD_SOURCE -DHAVE_STRERROR \ -DXP_UNIX -UDEBUG -DNDEBUG -D_REENTRANT $(MPICMN) @@ -222,6 +222,7 @@ ifeq ($(TARGET),AMD64SOLARIS) ASFLAGS += -xarch=generic64 AS_OBJS = mpi_amd64.o mpi_amd64_sun.o MP_CONFIG = -DMP_ASSEMBLY_MULTIPLY -DMPI_AMD64 +MP_CONFIG += -DMP_CHAR_STORE_SLOW -DMP_IS_LITTLE_ENDIAN CFLAGS = -xarch=generic64 -xO4 -I. -DMP_API_COMPATIBLE -DMP_IOFUNC $(MP_CONFIG) MPICMN += $(MP_CONFIG)