/*
 * mpint_i.h: definitions used internally by the bignum code, and
 * also a few other vaguely-bignum-like places.
 */

/* ----------------------------------------------------------------------
 * The assorted conditional definitions of BignumInt and multiply
 * macros used throughout the bignum code to treat numbers as arrays
 * of the most conveniently sized word for the target machine.
 * Exported so that other code (e.g. poly1305) can use it too.
 *
 * This code must export, in whatever ifdef branch it ends up in:
 *
 *  - two types: 'BignumInt' and 'BignumCarry'. BignumInt is an
 *    unsigned integer type which will be used as the base word size
 *    for all bignum operations. BignumCarry is an unsigned integer
 *    type used to hold the carry flag taken as input and output by
 *    the BignumADC macro (see below).
 *
 *  - five constant macros:
 *     + BIGNUM_INT_BITS, the number of bits in BignumInt,
 *     + BIGNUM_INT_BYTES, the number of bytes that works out to
 *     + BIGNUM_TOP_BIT, the BignumInt value consisting of only the top bit
 *     + BIGNUM_INT_MASK, the BignumInt value with all bits set
 *     + BIGNUM_INT_BITS_BITS, log to the base 2 of BIGNUM_INT_BITS.
 *
 *  - four statement macros: BignumADC, BignumMUL, BignumMULADD,
 *    BignumMULADD2. These do various kinds of multi-word arithmetic,
 *    and all produce two output values.
 *     * BignumADC(ret,retc,a,b,c) takes input BignumInt values a,b
 *       and a BignumCarry c, and outputs a BignumInt ret = a+b+c and
 *       a BignumCarry retc which is the carry off the top of that
 *       addition.
 *     * BignumMUL(rh,rl,a,b) returns the two halves of the
 *       double-width product a*b.
 *     * BignumMULADD(rh,rl,a,b,addend) returns the two halves of the
 *       double-width value a*b + addend.
 *     * BignumMULADD2(rh,rl,a,b,addend1,addend2) returns the two
 *       halves of the double-width value a*b + addend1 + addend2.
 *
 * Every branch of the main ifdef below defines the type BignumInt and
 * the value BIGNUM_INT_BITS_BITS. The other constant macros are
 * filled in by common code further down.
 *
 * Most branches also define a macro DEFINE_BIGNUMDBLINT containing a
 * typedef statement which declares a type _twice_ the length of a
 * BignumInt. This causes the common code further down to produce a
 * default implementation of the four statement macros in terms of
 * that double-width type, and also to defined BignumCarry to be
 * BignumInt.
 *
 * However, if a particular compile target does not have a type twice
 * the length of the BignumInt you want to use but it does provide
 * some alternative means of doing add-with-carry and double-word
 * multiply, then the ifdef branch in question can just define
 * BignumCarry and the four statement macros itself, and that's fine
 * too.
 */

/* You can lower the BignumInt size by defining BIGNUM_OVERRIDE on the
 * command line to be your chosen max value of BIGNUM_INT_BITS_BITS */
#define BB_OK(b) (!defined BIGNUM_OVERRIDE || BIGNUM_OVERRIDE >= b)

#if defined __SIZEOF_INT128__ && BB_OK(6)

  /*
   * 64-bit BignumInt using gcc/clang style 128-bit BignumDblInt.
   *
   * gcc and clang both provide a __uint128_t type on 64-bit targets
   * (and, when they do, indicate its presence by the above macro),
   * using the same 'two machine registers' kind of code generation
   * that 32-bit targets use for 64-bit ints.
   */

  typedef unsigned long long BignumInt;
  #define BIGNUM_INT_BITS_BITS 6
  #define DEFINE_BIGNUMDBLINT typedef __uint128_t BignumDblInt

#elif defined _MSC_VER && defined _M_AMD64 && BB_OK(6)

  /*
   * 64-bit BignumInt, using Visual Studio x86-64 compiler intrinsics.
   *
   * 64-bit Visual Studio doesn't provide very much in the way of help
   * here: there's no int128 type, and also no inline assembler giving
   * us direct access to the x86-64 MUL or ADC instructions. However,
   * there are compiler intrinsics giving us that access, so we can
   * use those - though it turns out we have to be a little careful,
   * since they seem to generate wrong code if their pointer-typed
   * output parameters alias their inputs. Hence all the internal temp
   * variables inside the macros.
   */

  #include <intrin.h>
  typedef unsigned char BignumCarry; /* the type _addcarry_u64 likes to use */
  typedef unsigned __int64 BignumInt;
  #define BIGNUM_INT_BITS_BITS 6
  #define BignumADC(ret, retc, a, b, c) do                \
      {                                                   \
          BignumInt ADC_tmp;                              \
          (retc) = _addcarry_u64(c, a, b, &ADC_tmp);      \
          (ret) = ADC_tmp;                                \
      } while (0)
  #define BignumMUL(rh, rl, a, b) do              \
      {                                           \
          BignumInt MULADD_hi;                    \
          (rl) = _umul128(a, b, &MULADD_hi);      \
          (rh) = MULADD_hi;                       \
      } while (0)
  #define BignumMULADD(rh, rl, a, b, addend) do                           \
      {                                                                   \
          BignumInt MULADD_lo, MULADD_hi;                                 \
          MULADD_lo = _umul128(a, b, &MULADD_hi);                         \
          MULADD_hi += _addcarry_u64(0, MULADD_lo, (addend), &(rl));     \
          (rh) = MULADD_hi;                                               \
      } while (0)
  #define BignumMULADD2(rh, rl, a, b, addend1, addend2) do                \
      {                                                                   \
          BignumInt MULADD_lo1, MULADD_lo2, MULADD_hi;                    \
          MULADD_lo1 = _umul128(a, b, &MULADD_hi);                        \
          MULADD_hi += _addcarry_u64(0, MULADD_lo1, (addend1), &MULADD_lo2); \
          MULADD_hi += _addcarry_u64(0, MULADD_lo2, (addend2), &(rl));    \
          (rh) = MULADD_hi;                                               \
      } while (0)

#elif (defined __GNUC__ || defined _LLP64 || __STDC__ >= 199901L) && BB_OK(5)

  /* 32-bit BignumInt, using C99 unsigned long long as BignumDblInt */

  typedef unsigned int BignumInt;
  #define BIGNUM_INT_BITS_BITS 5
  #define DEFINE_BIGNUMDBLINT typedef unsigned long long BignumDblInt

#elif defined _MSC_VER && BB_OK(5)

  /* 32-bit BignumInt, using Visual Studio __int64 as BignumDblInt */

  typedef unsigned int BignumInt;
  #define BIGNUM_INT_BITS_BITS 5
  #define DEFINE_BIGNUMDBLINT typedef unsigned __int64 BignumDblInt

#elif defined _LP64 && BB_OK(5)

  /*
   * 32-bit BignumInt, using unsigned long itself as BignumDblInt.
   *
   * Only for platforms where long is 64 bits, of course.
   */

  typedef unsigned int BignumInt;
  #define BIGNUM_INT_BITS_BITS 5
  #define DEFINE_BIGNUMDBLINT typedef unsigned long BignumDblInt

#elif BB_OK(4)

  /*
   * 16-bit BignumInt, using unsigned long as BignumDblInt.
   *
   * This is the final fallback for real emergencies: C89 guarantees
   * unsigned short/long to be at least the required sizes, so this
   * should work on any C implementation at all. But it'll be
   * noticeably slow, so if you find yourself in this case you
   * probably want to move heaven and earth to find an alternative!
   */

  typedef unsigned short BignumInt;
  #define BIGNUM_INT_BITS_BITS 4
  #define DEFINE_BIGNUMDBLINT typedef unsigned long BignumDblInt

#else

  /* Should only get here if BB_OK(4) evaluated false, i.e. the
   * command line defined BIGNUM_OVERRIDE to an absurdly small
   * value. */
  #error Must define BIGNUM_OVERRIDE to at least 4

#endif

#undef BB_OK

/*
 * Common code across all branches of that ifdef: define all the
 * easy constant macros in terms of BIGNUM_INT_BITS_BITS.
 */
#define BIGNUM_INT_BITS (1 << BIGNUM_INT_BITS_BITS)
#define BIGNUM_INT_BYTES (BIGNUM_INT_BITS / 8)
#define BIGNUM_TOP_BIT (((BignumInt)1) << (BIGNUM_INT_BITS-1))
#define BIGNUM_INT_MASK (BIGNUM_TOP_BIT | (BIGNUM_TOP_BIT-1))

/*
 * Just occasionally, we might need a GET_nnBIT_xSB_FIRST macro to
 * operate on whatever BignumInt is.
 */
#if BIGNUM_INT_BITS_BITS == 4
#define GET_BIGNUMINT_MSB_FIRST GET_16BIT_MSB_FIRST
#define GET_BIGNUMINT_LSB_FIRST GET_16BIT_LSB_FIRST
#define PUT_BIGNUMINT_MSB_FIRST PUT_16BIT_MSB_FIRST
#define PUT_BIGNUMINT_LSB_FIRST PUT_16BIT_LSB_FIRST
#elif BIGNUM_INT_BITS_BITS == 5
#define GET_BIGNUMINT_MSB_FIRST GET_32BIT_MSB_FIRST
#define GET_BIGNUMINT_LSB_FIRST GET_32BIT_LSB_FIRST
#define PUT_BIGNUMINT_MSB_FIRST PUT_32BIT_MSB_FIRST
#define PUT_BIGNUMINT_LSB_FIRST PUT_32BIT_LSB_FIRST
#elif BIGNUM_INT_BITS_BITS == 6
#define GET_BIGNUMINT_MSB_FIRST GET_64BIT_MSB_FIRST
#define GET_BIGNUMINT_LSB_FIRST GET_64BIT_LSB_FIRST
#define PUT_BIGNUMINT_MSB_FIRST PUT_64BIT_MSB_FIRST
#define PUT_BIGNUMINT_LSB_FIRST PUT_64BIT_LSB_FIRST
#else
  #error Ran out of options for GET_BIGNUMINT_xSB_FIRST
#endif

/*
 * Common code across _most_ branches of the ifdef: define a set of
 * statement macros in terms of the BignumDblInt type provided. In
 * this case, we also define BignumCarry to be the same thing as
 * BignumInt, for simplicity.
 */
#ifdef DEFINE_BIGNUMDBLINT

  typedef BignumInt BignumCarry;
  #define BignumADC(ret, retc, a, b, c) do                        \
      {                                                           \
          DEFINE_BIGNUMDBLINT;                                    \
          BignumDblInt ADC_temp = (BignumInt)(a);                 \
          ADC_temp += (BignumInt)(b);                             \
          ADC_temp += (c);                                        \
          (ret) = (BignumInt)ADC_temp;                            \
          (retc) = (BignumCarry)(ADC_temp >> BIGNUM_INT_BITS);    \
      } while (0)
  
  #define BignumMUL(rh, rl, a, b) do                              \
      {                                                           \
          DEFINE_BIGNUMDBLINT;                                    \
          BignumDblInt MUL_temp = (BignumInt)(a);                 \
          MUL_temp *= (BignumInt)(b);                             \
          (rh) = (BignumInt)(MUL_temp >> BIGNUM_INT_BITS);        \
          (rl) = (BignumInt)(MUL_temp);                           \
      } while (0)
  
  #define BignumMULADD(rh, rl, a, b, addend) do                   \
      {                                                           \
          DEFINE_BIGNUMDBLINT;                                    \
          BignumDblInt MUL_temp = (BignumInt)(a);                 \
          MUL_temp *= (BignumInt)(b);                             \
          MUL_temp += (BignumInt)(addend);                        \
          (rh) = (BignumInt)(MUL_temp >> BIGNUM_INT_BITS);        \
          (rl) = (BignumInt)(MUL_temp);                           \
      } while (0)
  
  #define BignumMULADD2(rh, rl, a, b, addend1, addend2) do        \
      {                                                           \
          DEFINE_BIGNUMDBLINT;                                    \
          BignumDblInt MUL_temp = (BignumInt)(a);                 \
          MUL_temp *= (BignumInt)(b);                             \
          MUL_temp += (BignumInt)(addend1);                       \
          MUL_temp += (BignumInt)(addend2);                       \
          (rh) = (BignumInt)(MUL_temp >> BIGNUM_INT_BITS);        \
          (rl) = (BignumInt)(MUL_temp);                           \
      } while (0)

#endif /* DEFINE_BIGNUMDBLINT */

/* ----------------------------------------------------------------------
 * Data structures used inside bignum.c.
 */

struct mp_int {
    size_t nw;
    BignumInt *w;
};

struct MontyContext {
    /*
     * The actual modulus.
     */
    mp_int *m;

    /*
     * Montgomery multiplication works by selecting a value r > m,
     * coprime to m, which is really easy to divide by. In binary
     * arithmetic, that means making it a power of 2; in fact we make
     * it a whole number of BignumInt.
     *
     * We don't store r directly as an mp_int (there's no need). But
     * its value is 2^rbits; we also store rw = rbits/BIGNUM_INT_BITS
     * (the corresponding word offset within an mp_int).
     *
     * pw is the number of words needed to store an mp_int you're
     * doing reduction on: it has to be big enough to hold the sum of
     * an input value up to m^2 plus an extra addend up to m*r.
     */
    size_t rbits, rw, pw;

    /*
     * The key step in Montgomery reduction requires the inverse of -m
     * mod r.
     */
    mp_int *minus_minv_mod_r;

    /*
     * r^1, r^2 and r^3 mod m, which are used for various purposes.
     *
     * (Annoyingly, this is one of the rare cases where it would have
     * been nicer to have a Pascal-style 1-indexed array. I couldn't
     * _quite_ bring myself to put a gratuitous zero element in here.
     * So you just have to live with getting r^k by taking the [k-1]th
     * element of this array.)
     */
    mp_int *powers_of_r_mod_m[3];

    /*
     * Persistent scratch space from which monty_* functions can
     * allocate storage for intermediate values.
     */
    mp_int *scratch;
};