gecko-dev/security/nss/lib/freebl/rijndael.c

/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#ifdef FREEBL_NO_DEPEND
#include "stubs.h"
#endif

#include "prinit.h"
#include "prenv.h"
#include "prerr.h"
#include "secerr.h"

#include "prtypes.h"
#include "blapi.h"
#include "rijndael.h"

#include "cts.h"
#include "ctr.h"
#include "gcm.h"
#include "mpi.h"

#ifdef USE_HW_AES
#include "intel-aes.h"
#endif
#ifdef INTEL_GCM
#include "intel-gcm.h"
#endif /* INTEL_GCM */

/* Forward declarations */
void rijndael_native_key_expansion(AESContext *cx, const unsigned char *key,
                                   unsigned int Nk);
void rijndael_native_encryptBlock(AESContext *cx,
                                  unsigned char *output,
                                  const unsigned char *input);

/* Stub definitions for the above rijndael_native_* functions, which
 * shouldn't be used unless NSS_X86_OR_X64 is defined */
#ifndef NSS_X86_OR_X64
void
rijndael_native_key_expansion(AESContext *cx, const unsigned char *key,
                              unsigned int Nk)
{
    PORT_SetError(SEC_ERROR_LIBRARY_FAILURE);
    PORT_Assert(0);
}

void
rijndael_native_encryptBlock(AESContext *cx,
                             unsigned char *output,
                             const unsigned char *input)
{
    PORT_SetError(SEC_ERROR_LIBRARY_FAILURE);
    PORT_Assert(0);
}
#endif /* NSS_X86_OR_X64 */

/*
 * There are currently three ways to build this code, varying in performance
 * and code size.
 *
 * RIJNDAEL_INCLUDE_TABLES         Include all tables from rijndael32.tab
 * RIJNDAEL_GENERATE_VALUES        Do not store tables, generate the table
 *                                 values "on-the-fly", using gfm
 * RIJNDAEL_GENERATE_VALUES_MACRO  Same as above, but use macros
 *
 * The default is RIJNDAEL_INCLUDE_TABLES.
 */

/*
 * When building RIJNDAEL_INCLUDE_TABLES, includes S**-1, Rcon, T[0..4],
 *                                                 T**-1[0..4], IMXC[0..4]
 * When building anything else, includes S, S**-1, Rcon
 */
#include "rijndael32.tab"

#if defined(RIJNDAEL_INCLUDE_TABLES)
/*
 * RIJNDAEL_INCLUDE_TABLES
 */
#define T0(i) _T0[i]
#define T1(i) _T1[i]
#define T2(i) _T2[i]
#define T3(i) _T3[i]
#define TInv0(i) _TInv0[i]
#define TInv1(i) _TInv1[i]
#define TInv2(i) _TInv2[i]
#define TInv3(i) _TInv3[i]
#define IMXC0(b) _IMXC0[b]
#define IMXC1(b) _IMXC1[b]
#define IMXC2(b) _IMXC2[b]
#define IMXC3(b) _IMXC3[b]
/* The S-box can be recovered from the T-tables */
#ifdef IS_LITTLE_ENDIAN
#define SBOX(b) ((PRUint8)_T3[b])
#else
#define SBOX(b) ((PRUint8)_T1[b])
#endif
#define SINV(b) (_SInv[b])

#else /* not RIJNDAEL_INCLUDE_TABLES */

/*
 * Code for generating T-table values.
 */

#ifdef IS_LITTLE_ENDIAN
#define WORD4(b0, b1, b2, b3) \
    ((((PRUint32)b3) << 24) | \
     (((PRUint32)b2) << 16) | \
     (((PRUint32)b1) << 8) |  \
     ((PRUint32)b0))
#else
#define WORD4(b0, b1, b2, b3) \
    ((((PRUint32)b0) << 24) | \
     (((PRUint32)b1) << 16) | \
     (((PRUint32)b2) << 8) |  \
     ((PRUint32)b3))
#endif

/*
 * Define the S and S**-1 tables (both have been stored)
 */
#define SBOX(b) (_S[b])
#define SINV(b) (_SInv[b])

/*
 * The function xtime, used for Galois field multiplication
 */
#define XTIME(a) \
    ((a & 0x80) ? ((a << 1) ^ 0x1b) : (a << 1))

/* Choose GFM method (macros or function) */
#if defined(RIJNDAEL_GENERATE_VALUES_MACRO)

/*
 * Galois field GF(2**8) multipliers, in macro form
 */
#define GFM01(a) \
    (a) /* a * 01 = a, the identity */
#define GFM02(a) \
    (XTIME(a) & 0xff) /* a * 02 = xtime(a) */
#define GFM04(a) \
    (GFM02(GFM02(a))) /* a * 04 = xtime**2(a) */
#define GFM08(a) \
    (GFM02(GFM04(a))) /* a * 08 = xtime**3(a) */
#define GFM03(a) \
    (GFM01(a) ^ GFM02(a)) /* a * 03 = a * (01 + 02) */
#define GFM09(a) \
    (GFM01(a) ^ GFM08(a)) /* a * 09 = a * (01 + 08) */
#define GFM0B(a) \
    (GFM01(a) ^ GFM02(a) ^ GFM08(a)) /* a * 0B = a * (01 + 02 + 08) */
#define GFM0D(a) \
    (GFM01(a) ^ GFM04(a) ^ GFM08(a)) /* a * 0D = a * (01 + 04 + 08) */
#define GFM0E(a) \
    (GFM02(a) ^ GFM04(a) ^ GFM08(a)) /* a * 0E = a * (02 + 04 + 08) */

#else /* RIJNDAEL_GENERATE_VALUES */

/* GF_MULTIPLY
 *
 * multiply two bytes represented in GF(2**8), mod (x**4 + 1)
 */
PRUint8
gfm(PRUint8 a, PRUint8 b)
{
    PRUint8 res = 0;
    while (b > 0) {
        res = (b & 0x01) ? res ^ a : res;
        a = XTIME(a);
        b >>= 1;
    }
    return res;
}

#define GFM01(a) \
    (a) /* a * 01 = a, the identity */
#define GFM02(a) \
    (XTIME(a) & 0xff) /* a * 02 = xtime(a) */
#define GFM03(a) \
    (gfm(a, 0x03)) /* a * 03 */
#define GFM09(a) \
    (gfm(a, 0x09)) /* a * 09 */
#define GFM0B(a) \
    (gfm(a, 0x0B)) /* a * 0B */
#define GFM0D(a) \
    (gfm(a, 0x0D)) /* a * 0D */
#define GFM0E(a) \
    (gfm(a, 0x0E)) /* a * 0E */

#endif /* choosing GFM function */

/*
 * The T-tables
 */
#define G_T0(i) \
    (WORD4(GFM02(SBOX(i)), GFM01(SBOX(i)), GFM01(SBOX(i)), GFM03(SBOX(i))))
#define G_T1(i) \
    (WORD4(GFM03(SBOX(i)), GFM02(SBOX(i)), GFM01(SBOX(i)), GFM01(SBOX(i))))
#define G_T2(i) \
    (WORD4(GFM01(SBOX(i)), GFM03(SBOX(i)), GFM02(SBOX(i)), GFM01(SBOX(i))))
#define G_T3(i) \
    (WORD4(GFM01(SBOX(i)), GFM01(SBOX(i)), GFM03(SBOX(i)), GFM02(SBOX(i))))

/*
 * The inverse T-tables
 */
#define G_TInv0(i) \
    (WORD4(GFM0E(SINV(i)), GFM09(SINV(i)), GFM0D(SINV(i)), GFM0B(SINV(i))))
#define G_TInv1(i) \
    (WORD4(GFM0B(SINV(i)), GFM0E(SINV(i)), GFM09(SINV(i)), GFM0D(SINV(i))))
#define G_TInv2(i) \
    (WORD4(GFM0D(SINV(i)), GFM0B(SINV(i)), GFM0E(SINV(i)), GFM09(SINV(i))))
#define G_TInv3(i) \
    (WORD4(GFM09(SINV(i)), GFM0D(SINV(i)), GFM0B(SINV(i)), GFM0E(SINV(i))))

/*
 * The inverse mix column tables
 */
#define G_IMXC0(i) \
    (WORD4(GFM0E(i), GFM09(i), GFM0D(i), GFM0B(i)))
#define G_IMXC1(i) \
    (WORD4(GFM0B(i), GFM0E(i), GFM09(i), GFM0D(i)))
#define G_IMXC2(i) \
    (WORD4(GFM0D(i), GFM0B(i), GFM0E(i), GFM09(i)))
#define G_IMXC3(i) \
    (WORD4(GFM09(i), GFM0D(i), GFM0B(i), GFM0E(i)))

/* Now choose the T-table indexing method */
#if defined(RIJNDAEL_GENERATE_VALUES)
/* generate values for the tables with a function*/
static PRUint32
gen_TInvXi(PRUint8 tx, PRUint8 i)
{
    PRUint8 si01, si02, si03, si04, si08, si09, si0B, si0D, si0E;
    si01 = SINV(i);
    si02 = XTIME(si01);
    si04 = XTIME(si02);
    si08 = XTIME(si04);
    si03 = si02 ^ si01;
    si09 = si08 ^ si01;
    si0B = si08 ^ si03;
    si0D = si09 ^ si04;
    si0E = si08 ^ si04 ^ si02;
    switch (tx) {
        case 0:
            return WORD4(si0E, si09, si0D, si0B);
        case 1:
            return WORD4(si0B, si0E, si09, si0D);
        case 2:
            return WORD4(si0D, si0B, si0E, si09);
        case 3:
            return WORD4(si09, si0D, si0B, si0E);
    }
    return -1;
}
#define T0(i) G_T0(i)
#define T1(i) G_T1(i)
#define T2(i) G_T2(i)
#define T3(i) G_T3(i)
#define TInv0(i) gen_TInvXi(0, i)
#define TInv1(i) gen_TInvXi(1, i)
#define TInv2(i) gen_TInvXi(2, i)
#define TInv3(i) gen_TInvXi(3, i)
#define IMXC0(b) G_IMXC0(b)
#define IMXC1(b) G_IMXC1(b)
#define IMXC2(b) G_IMXC2(b)
#define IMXC3(b) G_IMXC3(b)
#else /* RIJNDAEL_GENERATE_VALUES_MACRO */
/* generate values for the tables with macros */
#define T0(i) G_T0(i)
#define T1(i) G_T1(i)
#define T2(i) G_T2(i)
#define T3(i) G_T3(i)
#define TInv0(i) G_TInv0(i)
#define TInv1(i) G_TInv1(i)
#define TInv2(i) G_TInv2(i)
#define TInv3(i) G_TInv3(i)
#define IMXC0(b) G_IMXC0(b)
#define IMXC1(b) G_IMXC1(b)
#define IMXC2(b) G_IMXC2(b)
#define IMXC3(b) G_IMXC3(b)
#endif /* choose T-table indexing method */

#endif /* not RIJNDAEL_INCLUDE_TABLES */

/**************************************************************************
 *
 * Stuff related to the Rijndael key schedule
 *
 *************************************************************************/

#define SUBBYTE(w)                                \
    ((((PRUint32)SBOX((w >> 24) & 0xff)) << 24) | \
     (((PRUint32)SBOX((w >> 16) & 0xff)) << 16) | \
     (((PRUint32)SBOX((w >> 8) & 0xff)) << 8) |   \
     (((PRUint32)SBOX((w)&0xff))))

#ifdef IS_LITTLE_ENDIAN
#define ROTBYTE(b) \
    ((b >> 8) | (b << 24))
#else
#define ROTBYTE(b) \
    ((b << 8) | (b >> 24))
#endif

/* rijndael_key_expansion7
 *
 * Generate the expanded key from the key input by the user.
 * XXX
 * Nk == 7 (224 key bits) is a weird case.  Since Nk > 6, an added SubByte
 * transformation is done periodically.  The period is every 4 bytes, and
 * since 7%4 != 0 this happens at different times for each key word (unlike
 * Nk == 8 where it happens twice in every key word, in the same positions).
 * For now, I'm implementing this case "dumbly", w/o any unrolling.
 */
static void
rijndael_key_expansion7(AESContext *cx, const unsigned char *key, unsigned int Nk)
{
    unsigned int i;
    PRUint32 *W;
    PRUint32 *pW;
    PRUint32 tmp;
    W = cx->expandedKey;
    /* 1.  the first Nk words contain the cipher key */
    memcpy(W, key, Nk * 4);
    i = Nk;
    /* 2.  loop until full expanded key is obtained */
    pW = W + i - 1;
    for (; i < cx->Nb * (cx->Nr + 1); ++i) {
        tmp = *pW++;
        if (i % Nk == 0)
            tmp = SUBBYTE(ROTBYTE(tmp)) ^ Rcon[i / Nk - 1];
        else if (i % Nk == 4)
            tmp = SUBBYTE(tmp);
        *pW = W[i - Nk] ^ tmp;
    }
}

/* rijndael_key_expansion
 *
 * Generate the expanded key from the key input by the user.
 */
static void
rijndael_key_expansion(AESContext *cx, const unsigned char *key, unsigned int Nk)
{
    unsigned int i;
    PRUint32 *W;
    PRUint32 *pW;
    PRUint32 tmp;
    unsigned int round_key_words = cx->Nb * (cx->Nr + 1);
    if (Nk == 7) {
        rijndael_key_expansion7(cx, key, Nk);
        return;
    }
    W = cx->expandedKey;
    /* The first Nk words contain the input cipher key */
    memcpy(W, key, Nk * 4);
    i = Nk;
    pW = W + i - 1;
    /* Loop over all sets of Nk words, except the last */
    while (i < round_key_words - Nk) {
        tmp = *pW++;
        tmp = SUBBYTE(ROTBYTE(tmp)) ^ Rcon[i / Nk - 1];
        *pW = W[i++ - Nk] ^ tmp;
        tmp = *pW++;
        *pW = W[i++ - Nk] ^ tmp;
        tmp = *pW++;
        *pW = W[i++ - Nk] ^ tmp;
        tmp = *pW++;
        *pW = W[i++ - Nk] ^ tmp;
        if (Nk == 4)
            continue;
        switch (Nk) {
            case 8:
                tmp = *pW++;
                tmp = SUBBYTE(tmp);
                *pW = W[i++ - Nk] ^ tmp;
            case 7:
                tmp = *pW++;
                *pW = W[i++ - Nk] ^ tmp;
            case 6:
                tmp = *pW++;
                *pW = W[i++ - Nk] ^ tmp;
            case 5:
                tmp = *pW++;
                *pW = W[i++ - Nk] ^ tmp;
        }
    }
    /* Generate the last word */
    tmp = *pW++;
    tmp = SUBBYTE(ROTBYTE(tmp)) ^ Rcon[i / Nk - 1];
    *pW = W[i++ - Nk] ^ tmp;
    /* There may be overflow here, if Nk % (Nb * (Nr + 1)) > 0.  However,
     * since the above loop generated all but the last Nk key words, there
     * is no more need for the SubByte transformation.
     */
    if (Nk < 8) {
        for (; i < round_key_words; ++i) {
            tmp = *pW++;
            *pW = W[i - Nk] ^ tmp;
        }
    } else {
        /* except in the case when Nk == 8.  Then one more SubByte may have
         * to be performed, at i % Nk == 4.
         */
        for (; i < round_key_words; ++i) {
            tmp = *pW++;
            if (i % Nk == 4)
                tmp = SUBBYTE(tmp);
            *pW = W[i - Nk] ^ tmp;
        }
    }
}

/* rijndael_invkey_expansion
 *
 * Generate the expanded key for the inverse cipher from the key input by
 * the user.
 */
static void
rijndael_invkey_expansion(AESContext *cx, const unsigned char *key, unsigned int Nk)
{
    unsigned int r;
    PRUint32 *roundkeyw;
    PRUint8 *b;
    int Nb = cx->Nb;
    /* begins like usual key expansion ... */
    rijndael_key_expansion(cx, key, Nk);
    /* ... but has the additional step of InvMixColumn,
     * excepting the first and last round keys.
     */
    roundkeyw = cx->expandedKey + cx->Nb;
    for (r = 1; r < cx->Nr; ++r) {
        /* each key word, roundkeyw, represents a column in the key
         * matrix.  Each column is multiplied by the InvMixColumn matrix.
         *   [ 0E 0B 0D 09 ]   [ b0 ]
         *   [ 09 0E 0B 0D ] * [ b1 ]
         *   [ 0D 09 0E 0B ]   [ b2 ]
         *   [ 0B 0D 09 0E ]   [ b3 ]
         */
        b = (PRUint8 *)roundkeyw;
        *roundkeyw++ = IMXC0(b[0]) ^ IMXC1(b[1]) ^ IMXC2(b[2]) ^ IMXC3(b[3]);
        b = (PRUint8 *)roundkeyw;
        *roundkeyw++ = IMXC0(b[0]) ^ IMXC1(b[1]) ^ IMXC2(b[2]) ^ IMXC3(b[3]);
        b = (PRUint8 *)roundkeyw;
        *roundkeyw++ = IMXC0(b[0]) ^ IMXC1(b[1]) ^ IMXC2(b[2]) ^ IMXC3(b[3]);
        b = (PRUint8 *)roundkeyw;
        *roundkeyw++ = IMXC0(b[0]) ^ IMXC1(b[1]) ^ IMXC2(b[2]) ^ IMXC3(b[3]);
        if (Nb <= 4)
            continue;
        switch (Nb) {
            case 8:
                b = (PRUint8 *)roundkeyw;
                *roundkeyw++ = IMXC0(b[0]) ^ IMXC1(b[1]) ^
                               IMXC2(b[2]) ^ IMXC3(b[3]);
            case 7:
                b = (PRUint8 *)roundkeyw;
                *roundkeyw++ = IMXC0(b[0]) ^ IMXC1(b[1]) ^
                               IMXC2(b[2]) ^ IMXC3(b[3]);
            case 6:
                b = (PRUint8 *)roundkeyw;
                *roundkeyw++ = IMXC0(b[0]) ^ IMXC1(b[1]) ^
                               IMXC2(b[2]) ^ IMXC3(b[3]);
            case 5:
                b = (PRUint8 *)roundkeyw;
                *roundkeyw++ = IMXC0(b[0]) ^ IMXC1(b[1]) ^
                               IMXC2(b[2]) ^ IMXC3(b[3]);
        }
    }
}

/**************************************************************************
 *
 * Stuff related to Rijndael encryption/decryption.
 *
 *************************************************************************/

#ifdef IS_LITTLE_ENDIAN
#define BYTE0WORD(w) ((w)&0x000000ff)
#define BYTE1WORD(w) ((w)&0x0000ff00)
#define BYTE2WORD(w) ((w)&0x00ff0000)
#define BYTE3WORD(w) ((w)&0xff000000)
#else
#define BYTE0WORD(w) ((w)&0xff000000)
#define BYTE1WORD(w) ((w)&0x00ff0000)
#define BYTE2WORD(w) ((w)&0x0000ff00)
#define BYTE3WORD(w) ((w)&0x000000ff)
#endif

typedef union {
    PRUint32 w[4];
    PRUint8 b[16];
} rijndael_state;

#define COLUMN_0(state) state.w[0]
#define COLUMN_1(state) state.w[1]
#define COLUMN_2(state) state.w[2]
#define COLUMN_3(state) state.w[3]

#define STATE_BYTE(i) state.b[i]

static void NO_SANITIZE_ALIGNMENT
rijndael_encryptBlock128(AESContext *cx,
                         unsigned char *output,
                         const unsigned char *input)
{
    unsigned int r;
    PRUint32 *roundkeyw;
    rijndael_state state;
    PRUint32 C0, C1, C2, C3;
#if defined(NSS_X86_OR_X64)
#define pIn input
#define pOut output
#else
    unsigned char *pIn, *pOut;
    PRUint32 inBuf[4], outBuf[4];

    if ((ptrdiff_t)input & 0x3) {
        memcpy(inBuf, input, sizeof inBuf);
        pIn = (unsigned char *)inBuf;
    } else {
        pIn = (unsigned char *)input;
    }
    if ((ptrdiff_t)output & 0x3) {
        pOut = (unsigned char *)outBuf;
    } else {
        pOut = (unsigned char *)output;
    }
#endif
    roundkeyw = cx->expandedKey;
    /* Step 1: Add Round Key 0 to initial state */
    COLUMN_0(state) = *((PRUint32 *)(pIn)) ^ *roundkeyw++;
    COLUMN_1(state) = *((PRUint32 *)(pIn + 4)) ^ *roundkeyw++;
    COLUMN_2(state) = *((PRUint32 *)(pIn + 8)) ^ *roundkeyw++;
    COLUMN_3(state) = *((PRUint32 *)(pIn + 12)) ^ *roundkeyw++;
    /* Step 2: Loop over rounds [1..NR-1] */
    for (r = 1; r < cx->Nr; ++r) {
        /* Do ShiftRow, ByteSub, and MixColumn all at once */
        C0 = T0(STATE_BYTE(0)) ^
             T1(STATE_BYTE(5)) ^
             T2(STATE_BYTE(10)) ^
             T3(STATE_BYTE(15));
        C1 = T0(STATE_BYTE(4)) ^
             T1(STATE_BYTE(9)) ^
             T2(STATE_BYTE(14)) ^
             T3(STATE_BYTE(3));
        C2 = T0(STATE_BYTE(8)) ^
             T1(STATE_BYTE(13)) ^
             T2(STATE_BYTE(2)) ^
             T3(STATE_BYTE(7));
        C3 = T0(STATE_BYTE(12)) ^
             T1(STATE_BYTE(1)) ^
             T2(STATE_BYTE(6)) ^
             T3(STATE_BYTE(11));
        /* Round key addition */
        COLUMN_0(state) = C0 ^ *roundkeyw++;
        COLUMN_1(state) = C1 ^ *roundkeyw++;
        COLUMN_2(state) = C2 ^ *roundkeyw++;
        COLUMN_3(state) = C3 ^ *roundkeyw++;
    }
    /* Step 3: Do the last round */
    /* Final round does not employ MixColumn */
    C0 = ((BYTE0WORD(T2(STATE_BYTE(0)))) |
          (BYTE1WORD(T3(STATE_BYTE(5)))) |
          (BYTE2WORD(T0(STATE_BYTE(10)))) |
          (BYTE3WORD(T1(STATE_BYTE(15))))) ^
         *roundkeyw++;
    C1 = ((BYTE0WORD(T2(STATE_BYTE(4)))) |
          (BYTE1WORD(T3(STATE_BYTE(9)))) |
          (BYTE2WORD(T0(STATE_BYTE(14)))) |
          (BYTE3WORD(T1(STATE_BYTE(3))))) ^
         *roundkeyw++;
    C2 = ((BYTE0WORD(T2(STATE_BYTE(8)))) |
          (BYTE1WORD(T3(STATE_BYTE(13)))) |
          (BYTE2WORD(T0(STATE_BYTE(2)))) |
          (BYTE3WORD(T1(STATE_BYTE(7))))) ^
         *roundkeyw++;
    C3 = ((BYTE0WORD(T2(STATE_BYTE(12)))) |
          (BYTE1WORD(T3(STATE_BYTE(1)))) |
          (BYTE2WORD(T0(STATE_BYTE(6)))) |
          (BYTE3WORD(T1(STATE_BYTE(11))))) ^
         *roundkeyw++;
    *((PRUint32 *)pOut) = C0;
    *((PRUint32 *)(pOut + 4)) = C1;
    *((PRUint32 *)(pOut + 8)) = C2;
    *((PRUint32 *)(pOut + 12)) = C3;
#if defined(NSS_X86_OR_X64)
#undef pIn
#undef pOut
#else
    if ((ptrdiff_t)output & 0x3) {
        memcpy(output, outBuf, sizeof outBuf);
    }
#endif
}

static SECStatus NO_SANITIZE_ALIGNMENT
rijndael_decryptBlock128(AESContext *cx,
                         unsigned char *output,
                         const unsigned char *input)
{
    int r;
    PRUint32 *roundkeyw;
    rijndael_state state;
    PRUint32 C0, C1, C2, C3;
#if defined(NSS_X86_OR_X64)
#define pIn input
#define pOut output
#else
    unsigned char *pIn, *pOut;
    PRUint32 inBuf[4], outBuf[4];

    if ((ptrdiff_t)input & 0x3) {
        memcpy(inBuf, input, sizeof inBuf);
        pIn = (unsigned char *)inBuf;
    } else {
        pIn = (unsigned char *)input;
    }
    if ((ptrdiff_t)output & 0x3) {
        pOut = (unsigned char *)outBuf;
    } else {
        pOut = (unsigned char *)output;
    }
#endif
    roundkeyw = cx->expandedKey + cx->Nb * cx->Nr + 3;
    /* reverse the final key addition */
    COLUMN_3(state) = *((PRUint32 *)(pIn + 12)) ^ *roundkeyw--;
    COLUMN_2(state) = *((PRUint32 *)(pIn + 8)) ^ *roundkeyw--;
    COLUMN_1(state) = *((PRUint32 *)(pIn + 4)) ^ *roundkeyw--;
    COLUMN_0(state) = *((PRUint32 *)(pIn)) ^ *roundkeyw--;
    /* Loop over rounds in reverse [NR..1] */
    for (r = cx->Nr; r > 1; --r) {
        /* Invert the (InvByteSub*InvMixColumn)(InvShiftRow(state)) */
        C0 = TInv0(STATE_BYTE(0)) ^
             TInv1(STATE_BYTE(13)) ^
             TInv2(STATE_BYTE(10)) ^
             TInv3(STATE_BYTE(7));
        C1 = TInv0(STATE_BYTE(4)) ^
             TInv1(STATE_BYTE(1)) ^
             TInv2(STATE_BYTE(14)) ^
             TInv3(STATE_BYTE(11));
        C2 = TInv0(STATE_BYTE(8)) ^
             TInv1(STATE_BYTE(5)) ^
             TInv2(STATE_BYTE(2)) ^
             TInv3(STATE_BYTE(15));
        C3 = TInv0(STATE_BYTE(12)) ^
             TInv1(STATE_BYTE(9)) ^
             TInv2(STATE_BYTE(6)) ^
             TInv3(STATE_BYTE(3));
        /* Invert the key addition step */
        COLUMN_3(state) = C3 ^ *roundkeyw--;
        COLUMN_2(state) = C2 ^ *roundkeyw--;
        COLUMN_1(state) = C1 ^ *roundkeyw--;
        COLUMN_0(state) = C0 ^ *roundkeyw--;
    }
    /* inverse sub */
    pOut[0] = SINV(STATE_BYTE(0));
    pOut[1] = SINV(STATE_BYTE(13));
    pOut[2] = SINV(STATE_BYTE(10));
    pOut[3] = SINV(STATE_BYTE(7));
    pOut[4] = SINV(STATE_BYTE(4));
    pOut[5] = SINV(STATE_BYTE(1));
    pOut[6] = SINV(STATE_BYTE(14));
    pOut[7] = SINV(STATE_BYTE(11));
    pOut[8] = SINV(STATE_BYTE(8));
    pOut[9] = SINV(STATE_BYTE(5));
    pOut[10] = SINV(STATE_BYTE(2));
    pOut[11] = SINV(STATE_BYTE(15));
    pOut[12] = SINV(STATE_BYTE(12));
    pOut[13] = SINV(STATE_BYTE(9));
    pOut[14] = SINV(STATE_BYTE(6));
    pOut[15] = SINV(STATE_BYTE(3));
    /* final key addition */
    *((PRUint32 *)(pOut + 12)) ^= *roundkeyw--;
    *((PRUint32 *)(pOut + 8)) ^= *roundkeyw--;
    *((PRUint32 *)(pOut + 4)) ^= *roundkeyw--;
    *((PRUint32 *)pOut) ^= *roundkeyw--;
#if defined(NSS_X86_OR_X64)
#undef pIn
#undef pOut
#else
    if ((ptrdiff_t)output & 0x3) {
        memcpy(output, outBuf, sizeof outBuf);
    }
#endif
    return SECSuccess;
}

/**************************************************************************
 *
 *  Rijndael modes of operation (ECB and CBC)
 *
 *************************************************************************/

static SECStatus
rijndael_encryptECB(AESContext *cx, unsigned char *output,
                    unsigned int *outputLen, unsigned int maxOutputLen,
                    const unsigned char *input, unsigned int inputLen)
{
    AESBlockFunc *encryptor;

    if (aesni_support()) {
        /* Use hardware acceleration for normal AES parameters. */
        encryptor = &rijndael_native_encryptBlock;
    } else {
        encryptor = &rijndael_encryptBlock128;
    }
    while (inputLen > 0) {
        (*encryptor)(cx, output, input);
        output += AES_BLOCK_SIZE;
        input += AES_BLOCK_SIZE;
        inputLen -= AES_BLOCK_SIZE;
    }
    return SECSuccess;
}

static SECStatus
rijndael_encryptCBC(AESContext *cx, unsigned char *output,
                    unsigned int *outputLen, unsigned int maxOutputLen,
                    const unsigned char *input, unsigned int inputLen)
{
    unsigned int j;
    unsigned char *lastblock;
    unsigned char inblock[AES_BLOCK_SIZE * 8];

    if (!inputLen)
        return SECSuccess;
    lastblock = cx->iv;
    while (inputLen > 0) {
        /* XOR with the last block (IV if first block) */
        for (j = 0; j < AES_BLOCK_SIZE; ++j) {
            inblock[j] = input[j] ^ lastblock[j];
        }
        /* encrypt */
        rijndael_encryptBlock128(cx, output, inblock);
        /* move to the next block */
        lastblock = output;
        output += AES_BLOCK_SIZE;
        input += AES_BLOCK_SIZE;
        inputLen -= AES_BLOCK_SIZE;
    }
    memcpy(cx->iv, lastblock, AES_BLOCK_SIZE);
    return SECSuccess;
}

static SECStatus
rijndael_decryptECB(AESContext *cx, unsigned char *output,
                    unsigned int *outputLen, unsigned int maxOutputLen,
                    const unsigned char *input, unsigned int inputLen)
{
    while (inputLen > 0) {
        if (rijndael_decryptBlock128(cx, output, input) != SECSuccess) {
            return SECFailure;
        }
        output += AES_BLOCK_SIZE;
        input += AES_BLOCK_SIZE;
        inputLen -= AES_BLOCK_SIZE;
    }
    return SECSuccess;
}

static SECStatus
rijndael_decryptCBC(AESContext *cx, unsigned char *output,
                    unsigned int *outputLen, unsigned int maxOutputLen,
                    const unsigned char *input, unsigned int inputLen)
{
    const unsigned char *in;
    unsigned char *out;
    unsigned int j;
    unsigned char newIV[AES_BLOCK_SIZE];

    if (!inputLen)
        return SECSuccess;
    PORT_Assert(output - input >= 0 || input - output >= (int)inputLen);
    in = input + (inputLen - AES_BLOCK_SIZE);
    memcpy(newIV, in, AES_BLOCK_SIZE);
    out = output + (inputLen - AES_BLOCK_SIZE);
    while (inputLen > AES_BLOCK_SIZE) {
        if (rijndael_decryptBlock128(cx, out, in) != SECSuccess) {
            return SECFailure;
        }
        for (j = 0; j < AES_BLOCK_SIZE; ++j)
            out[j] ^= in[(int)(j - AES_BLOCK_SIZE)];
        out -= AES_BLOCK_SIZE;
        in -= AES_BLOCK_SIZE;
        inputLen -= AES_BLOCK_SIZE;
    }
    if (in == input) {
        if (rijndael_decryptBlock128(cx, out, in) != SECSuccess) {
            return SECFailure;
        }
        for (j = 0; j < AES_BLOCK_SIZE; ++j)
            out[j] ^= cx->iv[j];
    }
    memcpy(cx->iv, newIV, AES_BLOCK_SIZE);
    return SECSuccess;
}

/************************************************************************
 *
 * BLAPI Interface functions
 *
 * The following functions implement the encryption routines defined in
 * BLAPI for the AES cipher, Rijndael.
 *
 ***********************************************************************/

AESContext *
AES_AllocateContext(void)
{
    return PORT_ZNewAligned(AESContext, 16, mem);
}

/*
** Initialize a new AES context suitable for AES encryption/decryption in
** the ECB or CBC mode.
**  "mode" the mode of operation, which must be NSS_AES or NSS_AES_CBC
*/
static SECStatus
aes_InitContext(AESContext *cx, const unsigned char *key, unsigned int keysize,
                const unsigned char *iv, int mode, unsigned int encrypt)
{
    unsigned int Nk;
    PRBool use_hw_aes;
    /* According to AES, block lengths are 128 and key lengths are 128, 192, or
     * 256 bits. We support other key sizes as well [128, 256] as long as the
     * length in bytes is divisible by 4.
     */

    if (key == NULL ||
        keysize < AES_BLOCK_SIZE ||
        keysize > 32 ||
        keysize % 4 != 0) {
        PORT_SetError(SEC_ERROR_INVALID_ARGS);
        return SECFailure;
    }
    if (mode != NSS_AES && mode != NSS_AES_CBC) {
        PORT_SetError(SEC_ERROR_INVALID_ARGS);
        return SECFailure;
    }
    if (mode == NSS_AES_CBC && iv == NULL) {
        PORT_SetError(SEC_ERROR_INVALID_ARGS);
        return SECFailure;
    }
    if (!cx) {
        PORT_SetError(SEC_ERROR_INVALID_ARGS);
        return SECFailure;
    }
    use_hw_aes = aesni_support() && (keysize % 8) == 0;
    /* Nb = (block size in bits) / 32 */
    cx->Nb = AES_BLOCK_SIZE / 4;
    /* Nk = (key size in bits) / 32 */
    Nk = keysize / 4;
    /* Obtain number of rounds from "table" */
    cx->Nr = RIJNDAEL_NUM_ROUNDS(Nk, cx->Nb);
    /* copy in the iv, if neccessary */
    if (mode == NSS_AES_CBC) {
        memcpy(cx->iv, iv, AES_BLOCK_SIZE);
#ifdef USE_HW_AES
        if (use_hw_aes) {
            cx->worker = (freeblCipherFunc)
                intel_aes_cbc_worker(encrypt, keysize);
        } else
#endif
        {
            cx->worker = (freeblCipherFunc)(encrypt
                                                ? &rijndael_encryptCBC
                                                : &rijndael_decryptCBC);
        }
    } else {
#ifdef USE_HW_AES
        if (use_hw_aes) {
            cx->worker = (freeblCipherFunc)
                intel_aes_ecb_worker(encrypt, keysize);
        } else
#endif
        {
            cx->worker = (freeblCipherFunc)(encrypt
                                                ? &rijndael_encryptECB
                                                : &rijndael_decryptECB);
        }
    }
    PORT_Assert((cx->Nb * (cx->Nr + 1)) <= RIJNDAEL_MAX_EXP_KEY_SIZE);
    if ((cx->Nb * (cx->Nr + 1)) > RIJNDAEL_MAX_EXP_KEY_SIZE) {
        PORT_SetError(SEC_ERROR_LIBRARY_FAILURE);
        return SECFailure;
    }
#ifdef USE_HW_AES
    if (use_hw_aes) {
        intel_aes_init(encrypt, keysize);
    } else
#endif
    {
        /* Generate expanded key */
        if (encrypt) {
            if (use_hw_aes && (cx->mode == NSS_AES_GCM || cx->mode == NSS_AES ||
                               cx->mode == NSS_AES_CTR)) {
                PORT_Assert(keysize == 16 || keysize == 24 || keysize == 32);
                /* Prepare hardware key for normal AES parameters. */
                rijndael_native_key_expansion(cx, key, Nk);
            } else {
                rijndael_key_expansion(cx, key, Nk);
            }
        } else {
            rijndael_invkey_expansion(cx, key, Nk);
        }
    }
    cx->worker_cx = cx;
    cx->destroy = NULL;
    cx->isBlock = PR_TRUE;
    return SECSuccess;
}

SECStatus
AES_InitContext(AESContext *cx, const unsigned char *key, unsigned int keysize,
                const unsigned char *iv, int mode, unsigned int encrypt,
                unsigned int blocksize)
{
    int basemode = mode;
    PRBool baseencrypt = encrypt;
    SECStatus rv;

    if (blocksize != AES_BLOCK_SIZE) {
        PORT_SetError(SEC_ERROR_INVALID_ARGS);
        return SECFailure;
    }

    switch (mode) {
        case NSS_AES_CTS:
            basemode = NSS_AES_CBC;
            break;
        case NSS_AES_GCM:
        case NSS_AES_CTR:
            basemode = NSS_AES;
            baseencrypt = PR_TRUE;
            break;
    }
    /* Make sure enough is initialized so we can safely call Destroy. */
    cx->worker_cx = NULL;
    cx->destroy = NULL;
    cx->mode = mode;
    rv = aes_InitContext(cx, key, keysize, iv, basemode, baseencrypt);
    if (rv != SECSuccess) {
        AES_DestroyContext(cx, PR_FALSE);
        return rv;
    }

    /* finally, set up any mode specific contexts */
    switch (mode) {
        case NSS_AES_CTS:
            cx->worker_cx = CTS_CreateContext(cx, cx->worker, iv);
            cx->worker = (freeblCipherFunc)(encrypt ? CTS_EncryptUpdate : CTS_DecryptUpdate);
            cx->destroy = (freeblDestroyFunc)CTS_DestroyContext;
            cx->isBlock = PR_FALSE;
            break;
        case NSS_AES_GCM:
#if defined(INTEL_GCM) && defined(USE_HW_AES)
            if (aesni_support() && (keysize % 8) == 0 && avx_support() &&
                clmul_support()) {
                cx->worker_cx = intel_AES_GCM_CreateContext(cx, cx->worker, iv);
                cx->worker = (freeblCipherFunc)(encrypt ? intel_AES_GCM_EncryptUpdate
                                                        : intel_AES_GCM_DecryptUpdate);
                cx->destroy = (freeblDestroyFunc)intel_AES_GCM_DestroyContext;
                cx->isBlock = PR_FALSE;
            } else
#endif
            {
                cx->worker_cx = GCM_CreateContext(cx, cx->worker, iv);
                cx->worker = (freeblCipherFunc)(encrypt ? GCM_EncryptUpdate
                                                        : GCM_DecryptUpdate);
                cx->destroy = (freeblDestroyFunc)GCM_DestroyContext;
                cx->isBlock = PR_FALSE;
            }
            break;
        case NSS_AES_CTR:
            cx->worker_cx = CTR_CreateContext(cx, cx->worker, iv);
#if defined(USE_HW_AES) && defined(_MSC_VER)
            if (aesni_support() && (keysize % 8) == 0) {
                cx->worker = (freeblCipherFunc)CTR_Update_HW_AES;
            } else
#endif
            {
                cx->worker = (freeblCipherFunc)CTR_Update;
            }
            cx->destroy = (freeblDestroyFunc)CTR_DestroyContext;
            cx->isBlock = PR_FALSE;
            break;
        default:
            /* everything has already been set up by aes_InitContext, just
             * return */
            return SECSuccess;
    }
    /* check to see if we succeeded in getting the worker context */
    if (cx->worker_cx == NULL) {
        /* no, just destroy the existing context */
        cx->destroy = NULL; /* paranoia, though you can see a dozen lines */
                            /* below that this isn't necessary */
        AES_DestroyContext(cx, PR_FALSE);
        return SECFailure;
    }
    return SECSuccess;
}

/* AES_CreateContext
 *
 * create a new context for Rijndael operations
 */
AESContext *
AES_CreateContext(const unsigned char *key, const unsigned char *iv,
                  int mode, int encrypt,
                  unsigned int keysize, unsigned int blocksize)
{
    AESContext *cx = AES_AllocateContext();
    if (cx) {
        SECStatus rv = AES_InitContext(cx, key, keysize, iv, mode, encrypt,
                                       blocksize);
        if (rv != SECSuccess) {
            AES_DestroyContext(cx, PR_TRUE);
            cx = NULL;
        }
    }
    return cx;
}

/*
 * AES_DestroyContext
 *
 * Zero an AES cipher context.  If freeit is true, also free the pointer
 * to the context.
 */
void
AES_DestroyContext(AESContext *cx, PRBool freeit)
{
    if (cx->worker_cx && cx->destroy) {
        (*cx->destroy)(cx->worker_cx, PR_TRUE);
        cx->worker_cx = NULL;
        cx->destroy = NULL;
    }
    if (freeit) {
        PORT_Free(cx->mem);
    }
}

/*
 * AES_Encrypt
 *
 * Encrypt an arbitrary-length buffer.  The output buffer must already be
 * allocated to at least inputLen.
 */
SECStatus
AES_Encrypt(AESContext *cx, unsigned char *output,
            unsigned int *outputLen, unsigned int maxOutputLen,
            const unsigned char *input, unsigned int inputLen)
{
    /* Check args */
    if (cx == NULL || output == NULL || (input == NULL && inputLen != 0)) {
        PORT_SetError(SEC_ERROR_INVALID_ARGS);
        return SECFailure;
    }
    if (cx->isBlock && (inputLen % AES_BLOCK_SIZE != 0)) {
        PORT_SetError(SEC_ERROR_INPUT_LEN);
        return SECFailure;
    }
    if (maxOutputLen < inputLen) {
        PORT_SetError(SEC_ERROR_OUTPUT_LEN);
        return SECFailure;
    }
    *outputLen = inputLen;
#if UINT_MAX > MP_32BIT_MAX
    /*
     * we can guarentee that GSM won't overlfow if we limit the input to
     * 2^36 bytes. For simplicity, we are limiting it to 2^32 for now.
     *
     * We do it here to cover both hardware and software GCM operations.
     */
    {
        PR_STATIC_ASSERT(sizeof(unsigned int) > 4);
    }
    if ((cx->mode == NSS_AES_GCM) && (inputLen > MP_32BIT_MAX)) {
        PORT_SetError(SEC_ERROR_OUTPUT_LEN);
        return SECFailure;
    }
#else
    /* if we can't pass in a 32_bit number, then no such check needed */
    {
        PR_STATIC_ASSERT(sizeof(unsigned int) <= 4);
    }
#endif

    return (*cx->worker)(cx->worker_cx, output, outputLen, maxOutputLen,
                         input, inputLen, AES_BLOCK_SIZE);
}

/*
 * AES_Decrypt
 *
 * Decrypt and arbitrary-length buffer.  The output buffer must already be
 * allocated to at least inputLen.
 */
SECStatus
AES_Decrypt(AESContext *cx, unsigned char *output,
            unsigned int *outputLen, unsigned int maxOutputLen,
            const unsigned char *input, unsigned int inputLen)
{
    /* Check args */
    if (cx == NULL || output == NULL || (input == NULL && inputLen != 0)) {
        PORT_SetError(SEC_ERROR_INVALID_ARGS);
        return SECFailure;
    }
    if (cx->isBlock && (inputLen % AES_BLOCK_SIZE != 0)) {
        PORT_SetError(SEC_ERROR_INPUT_LEN);
        return SECFailure;
    }
    if (maxOutputLen < inputLen) {
        PORT_SetError(SEC_ERROR_OUTPUT_LEN);
        return SECFailure;
    }
    *outputLen = inputLen;
    return (*cx->worker)(cx->worker_cx, output, outputLen, maxOutputLen,
                         input, inputLen, AES_BLOCK_SIZE);
}